From be212d86b19c0d83bceeb1bec0805fd69cebf95c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Feb 2025 15:03:38 -0500 Subject: [PATCH 001/180] bcachefs: bs > ps support bcachefs removed most PAGE_SIZE references long ago, so this is easy; only readpage_bio_extend() has to be tweaked to respect the minimum order. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 12 +++++++++++- fs/bcachefs/fs.c | 3 ++- fs/bcachefs/super-io.c | 9 --------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index ab1d5db2fa56..d9a360782946 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans, if (!get_more) break; + unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); + + if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) + break; + + unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); + + /* ensure proper alignment */ + order = min(order, __ffs(folio_offset|BIT(31))); + folio = xa_load(&iter->mapping->i_pages, folio_offset); if (folio && !xa_is_value(folio)) break; - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); if (!folio) break; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 90ade8f648d9..5d910f1c671c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1802,7 +1802,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, break; } - mapping_set_large_folios(inode->v.i_mapping); + mapping_set_folio_min_order(inode->v.i_mapping, + get_order(trans->c->opts.block_size)); } static void bch2_free_inode(struct inode *vinode) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index a81a7b6c0989..5bd7bb90ee48 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -372,7 +372,6 @@ 
static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; - u16 block_size; int ret; ret = bch2_sb_compatible(sb, out); @@ -391,14 +390,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_features; } - block_size = le16_to_cpu(sb->block_size); - - if (block_size > PAGE_SECTORS) { - prt_printf(out, "Block size too big (got %u, max %u)", - block_size, PAGE_SECTORS); - return -BCH_ERR_invalid_sb_block_size; - } - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { prt_printf(out, "Bad user UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; From 2deae558043392e76ec64642e8fd0db3cce987a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Feb 2025 15:04:58 -0500 Subject: [PATCH 002/180] bcachefs: btree_node_(rewrite|update_key) cleanup Factor out get_iter_to_node() and use it for btree_node_rewrite_get_iter(), to be used for fixing btree node write error behaviour. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 126 ++++++++++++++-------------- fs/bcachefs/btree_update_interior.h | 3 + fs/bcachefs/errcode.h | 1 + 3 files changed, 68 insertions(+), 62 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e4e7c804625e..05aa9e32adf4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2126,6 +2126,31 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, goto out; } +static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b) +{ + bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* has node been freed? 
*/ + if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + ret = -BCH_ERR_btree_node_dying; + goto err; + } + + BUG_ON(!btree_node_hashed(b)); + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, @@ -2191,6 +2216,41 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, goto out; } +static int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, unsigned flags) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, + btree, k->k.p, + BTREE_MAX_DEPTH, level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); + ret = found + ? bch2_btree_node_rewrite(trans, &iter, b, flags) + : -ENOENT; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, + struct btree *b, unsigned flags) +{ + struct btree_iter iter; + int ret = get_iter_to_node(trans, &iter, b); + if (ret) + return ret == -BCH_ERR_btree_node_dying ? 
0 : ret; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + struct async_btree_rewrite { struct bch_fs *c; struct work_struct work; @@ -2200,57 +2260,14 @@ struct async_btree_rewrite { struct bkey_buf key; }; -static int async_btree_node_rewrite_trans(struct btree_trans *trans, - struct async_btree_rewrite *a) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, - a->btree_id, a->key.k->k.p, - BTREE_MAX_DEPTH, a->level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto out; - - bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); - ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, 0) - : -ENOENT; - -#if 0 - /* Tracepoint... */ - if (!ret || ret == -ENOENT) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - if (!ret) { - prt_printf(&buf, "rewrite node:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - } else { - prt_printf(&buf, "node to rewrite not found:\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - prt_printf(&buf, "\n got: "); - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null)"); - } - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -#endif -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); + int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, + a->btree_id, a->level, a->key.k, 0)); if (ret != -ENOENT) bch_err_fn_ratelimited(c, ret); @@ -2494,30 +2511,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, unsigned commit_flags, bool skip_triggers) { struct btree_iter iter; - int ret; - - 
bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter); + int ret = get_iter_to_node(trans, &iter, b); if (ret) - goto out; - - /* has node been freed? */ - if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - goto out; - } - - BUG_ON(!btree_node_hashed(b)); + return ret == -BCH_ERR_btree_node_dying ? 0 : ret; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); -out: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 26d646e1275c..9261a9a341fb 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -169,6 +169,9 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, + struct btree *, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 4590cd0c7c90..712877036612 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -119,6 +119,7 @@ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ + x(ENOENT, btree_node_dying) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ From c3c9957c818f1cb2de2865f223fad80afa28ffad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Feb 2025 13:33:08 -0500 Subject: [PATCH 
003/180] bcachefs: check_bp_exists() check for backpointers for stale pointers Early version of 'bcachefs_metadata_version_cached_backpointers' was creating backpointers for stale cached pointers - whoops. Now we have to repair those. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ebeb6a5ff9d2..1d30066e63dc 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -514,6 +514,22 @@ static int check_bp_exists(struct btree_trans *trans, if (!other_extent.k) goto missing; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); + if (ca) { + struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); + bkey_for_each_ptr(other_extent_ptrs, ptr) + if (ptr->dev == bp->k.p.inode && + dev_ptr_stale_rcu(ca, ptr)) { + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); + if (ret) + goto err; + goto out; + } + } + rcu_read_unlock(); + if (bch2_extents_match(orig_k, other_extent)) { printbuf_reset(&buf); prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); From e5a63ad343cc19c64875f2496ce5f7b992ef0c32 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 29 Jan 2025 15:51:37 -0500 Subject: [PATCH 004/180] bcachefs: Fix missing increment of move_extent_write counter Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 642fbc60ecab..e2050256136e 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -436,6 +436,8 @@ void bch2_data_update_read_done(struct data_update *m, m->op.crc = crc; m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + this_cpu_add(m->op.c->counters[BCH_COUNTER_move_extent_write], m->k.k->k.size); + closure_call(&m->op.cl, bch2_write, NULL, NULL); } From 
55a132c37acded02391664c108fcea5006c72fd5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Jan 2025 03:41:31 -0500 Subject: [PATCH 005/180] bcachefs: Don't inc io_(read|write) counters for moves This makes 'bcachefs fs top' more useful; we can now see at a glance whether the IO to the device is being done for user reads/writes, or copygc/rebalance. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 3 ++- fs/bcachefs/io_write.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index aa91fcf51eec..b2b0280d8365 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1091,7 +1091,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (rbio->bounce) trace_and_count(c, read_bounce, &rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + if (!(flags & BCH_READ_NODECODE)) + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 03892388832b..970f3f0959a4 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1644,7 +1644,8 @@ CLOSURE_CALLBACK(bch2_write) goto err; } - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + if (!(op->flags & BCH_WRITE_MOVE)) + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, From 78c9c6f6cd25c2aabc29aa8129996031fdfeb31b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jan 2025 12:07:54 -0500 Subject: [PATCH 006/180] bcachefs: Move write_points to debugfs this was hitting the sysfs 4k limit Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 32 +++++++++++++++++++++++++++++--- fs/bcachefs/sysfs.c | 5 ----- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/debug.c 
b/fs/bcachefs/debug.c index 55333e82d1fe..03a3b62d19a9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -7,6 +7,7 @@ */ #include "bcachefs.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -844,8 +845,11 @@ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) seqmutex_unlock(&c->btree_trans_lock); } -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); + +static ssize_t bch2_simple_print(struct file *file, char __user *buf, + size_t size, loff_t *ppos, + fs_to_text_fn fn) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -856,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, i->ret = 0; if (!i->iter) { - btree_deadlock_to_text(&i->buf, c); + fn(&i->buf, c); i->iter++; } @@ -869,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, return ret ?: i->ret; } +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); +} + static const struct file_operations btree_deadlock_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, @@ -876,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = { .read = bch2_btree_deadlock_read, }; +static ssize_t bch2_write_points_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); +} + +static const struct file_operations write_points_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_write_points_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -927,6 +950,9 @@ void bch2_fs_debug_init(struct 
bch_fs *c) debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); + debugfs_create_file("write_points", 0400, c->fs_debug_dir, + c->btree_debug, &write_points_ops); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a7eb1f511484..b3f2c651c1f8 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -176,7 +176,6 @@ read_attribute(btree_reserve_cache); read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); -read_attribute(write_points); read_attribute(nocow_lock_table); #ifdef BCH_WRITE_REF_DEBUG @@ -364,9 +363,6 @@ SHOW(bch2_fs) if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); - if (attr == &sysfs_write_points) - bch2_write_points_to_text(out, c); - if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -569,7 +565,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, - &sysfs_write_points, #ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, #endif From 999cc1bb6888c484fbd5dd9c8397c200772bd243 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Jan 2025 14:38:59 -0500 Subject: [PATCH 007/180] bcachefs: Separate running/runnable in wp stats We've got per-writepoint statistics to see how well the writepoint index update threads are pipelining; this separates running vs. runnable so we can see at a glance if they're blocking. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_types.h | 2 ++ fs/bcachefs/io_write.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4aa8ee026cb8..8f79f46c2a78 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -90,6 +90,7 @@ struct dev_stripe_state { x(stopped) \ x(waiting_io) \ x(waiting_work) \ + x(runnable) \ x(running) enum write_point_state { @@ -125,6 +126,7 @@ struct write_point { enum write_point_state state; u64 last_state_change; u64 time[WRITE_POINT_STATE_NR]; + u64 last_runtime; } __aligned(SMP_CACHE_BYTES); }; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 970f3f0959a4..a903f39caa3e 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -587,7 +587,15 @@ static void __bch2_write_index(struct bch_write_op *op) static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) { if (state != wp->state) { + struct task_struct *p = current; u64 now = ktime_get_ns(); + u64 runtime = p->se.sum_exec_runtime + + (now - p->se.exec_start); + + if (state == WRITE_POINT_runnable) + wp->last_runtime = runtime; + else if (wp->state == WRITE_POINT_runnable) + wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; if (wp->last_state_change && time_after64(now, wp->last_state_change)) @@ -601,7 +609,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) { enum write_point_state state; - state = running ? WRITE_POINT_running : + state = running ? WRITE_POINT_runnable: !list_empty(&wp->writes) ? WRITE_POINT_waiting_io : WRITE_POINT_stopped; From bbd804f2ad3034ac1053b9188e6bb3d705570e25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Jan 2025 03:28:27 -0500 Subject: [PATCH 008/180] bcachefs: enum bch_persistent_counters_stable Persistent counters, like recovery passes, include a stable enum in their definition - but this was never correctly plumbed. 
This allows us to add new counters and properly organize them with a non-stable "presentation order", which can also be used in userspace by the new 'bcachefs fs top' tool. Fortunately, since we haven't yet added any new counters where presentation order ID doesn't match stable ID, this won't cause any reordering issues. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-counters.c | 51 ++++++++++++++++++-------------- fs/bcachefs/sb-counters_format.h | 7 +++++ 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 6992e7469112..5153a47ec7d4 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -5,11 +5,10 @@ /* BCH_SB_FIELD_counters */ -static const char * const bch2_counter_names[] = { -#define x(t, n, ...) (#t), +static const u8 counters_to_stable_map[] = { +#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, BCH_PERSISTENT_COUNTERS() #undef x - NULL }; static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) @@ -18,13 +17,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return 0; return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -}; +} static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { return 0; -}; +} static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -32,50 +31,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_counters *ctrs = field_to_type(f, counters); unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (unsigned i = 0; i < nr; i++) - prt_printf(out, "%s \t%llu\n", - i < BCH_COUNTER_NR ?
bch2_counter_names[i] : "(unknown)", - le64_to_cpu(ctrs->d[i])); -}; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + prt_printf(out, "%s \t%llu\n", + bch2_counter_names[i], + le64_to_cpu(ctrs->d[stable])); + } +} int bch2_sb_counters_to_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - u64 val = 0; - for (i = 0; i < BCH_COUNTER_NR; i++) + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) c->counters_on_mount[i] = 0; - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { - val = le64_to_cpu(ctrs->d[i]); - percpu_u64_set(&c->counters[i], val); - c->counters_on_mount[i] = val; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) { + u64 v = le64_to_cpu(ctrs->d[stable]); + percpu_u64_set(&c->counters[i], v); + c->counters_on_mount[i] = v; + } } + return 0; -}; +} int bch2_sb_counters_from_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { ctrs = ret; nr = bch2_sb_counter_nr_entries(ctrs); } } + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + } - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) - ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); return 0; } diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fdcf598f08b1..cb44d9ee1ac5 100644 --- a/fs/bcachefs/sb-counters_format.h 
+++ b/fs/bcachefs/sb-counters_format.h @@ -95,6 +95,13 @@ enum bch_persistent_counters { BCH_COUNTER_NR }; +enum bch_persistent_counters_stable { +#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_STABLE_NR +}; + struct bch_sb_field_counters { struct bch_sb_field field; __le64 d[]; From 5ee760f667e09b95a19beea265077eb2f69cd12b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 Jan 2025 03:33:16 -0500 Subject: [PATCH 009/180] bcachefs: BCH_COUNTER_bucket_discard_fast Add a separate counter for fastpath bucket discards, which don't require a journal flush. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- fs/bcachefs/sb-counters_format.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3ea809990ef1..43c29b0d2d20 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1897,7 +1897,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - count_event(c, bucket_discard); + if (!fastpath) + count_event(c, bucket_discard); + else + count_event(c, bucket_discard_fast); out: fsck_err: if (discard_locked) diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index cb44d9ee1ac5..d0391c5d4c48 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -13,6 +13,7 @@ enum counters_flags { x(io_move, 2, TYPE_SECTORS) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_discard_fast, 79, TYPE_COUNTER) \ x(bucket_alloc, 5, TYPE_COUNTER) \ x(bucket_alloc_fail, 6, TYPE_COUNTER) \ x(btree_cache_scan, 7, TYPE_COUNTER) \ From 50ca857457e0a983bc6f881fcb1c47f8322a2c48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Jan 2025 22:05:02 -0500 Subject: [PATCH 010/180] bcachefs: BCH_IOCTL_QUERY_COUNTERS Add an ioctl for querying counters, the same ones 
provided in /sys/fs/bcachefs/<uuid>/counters/, but more suitable for a 'bcachefs top' command. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 10 +++++++++ fs/bcachefs/chardev.c | 3 +++ fs/bcachefs/sb-counters.c | 43 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/sb-counters.h | 4 ++++ 4 files changed, 60 insertions(+) diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 3c23bdf788ce..f1b746fac007 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -87,6 +87,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) +#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -443,4 +444,13 @@ struct bch_ioctl_query_accounting { struct bkey_i_accounting accounting[]; }; +#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) + +struct bch_ioctl_query_counters { + __u16 nr; + __u16 flags; + __u32 pad; + __u64 d[]; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 46e9e32105a9..bab49d5ee598 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -11,6 +11,7 @@ #include "move.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-counters.h" #include "super-io.h" #include "thread_with_file.h" @@ -710,6 +711,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); case BCH_IOCTL_QUERY_ACCOUNTING: return bch2_ioctl_query_accounting(c, arg); + case BCH_IOCTL_QUERY_COUNTERS: + return bch2_ioctl_query_counters(c, arg); default: return -ENOTTY; } diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 5153a47ec7d4..2b4b8445d418 100644 ---
a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -11,6 +11,13 @@ static const u8 counters_to_stable_map[] = { #undef x }; +const char * const bch2_counter_names[] = { +#define x(t, n, ...) (#t), + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; + static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) { if (!ctrs) @@ -102,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { .validate = bch2_sb_counters_validate, .to_text = bch2_sb_counters_to_text, }; + +#ifndef NO_BCACHEFS_CHARDEV +long bch2_ioctl_query_counters(struct bch_fs *c, + struct bch_ioctl_query_counters __user *user_arg) +{ + struct bch_ioctl_query_counters arg; + int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); + if (ret) + return ret; + + if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || + arg.pad) + return -EINVAL; + + arg.nr = min(arg.nr, BCH_COUNTER_NR); + ret = put_user(arg.nr, &user_arg->nr); + if (ret) + return ret; + + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + + if (stable < arg.nr) { + u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) + ? 
percpu_u64_get(&c->counters[i]) + : c->counters_on_mount[i]; + + ret = put_user(v, &user_arg->d[stable]); + if (ret) + return ret; + } + } + + return 0; +} +#endif diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h index 81f8aec9fcb1..a4329ad8dd1b 100644 --- a/fs/bcachefs/sb-counters.h +++ b/fs/bcachefs/sb-counters.h @@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); void bch2_fs_counters_exit(struct bch_fs *); int bch2_fs_counters_init(struct bch_fs *); +extern const char * const bch2_counter_names[]; extern const struct bch_sb_field_ops bch_sb_field_ops_counters; +long bch2_ioctl_query_counters(struct bch_fs *, + struct bch_ioctl_query_counters __user *); + #endif // _BCACHEFS_SB_COUNTERS_H From 3075e68d268844b50b7a8a078510d1c960a1325f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 31 Dec 2024 18:16:17 -0500 Subject: [PATCH 011/180] bcachefs: bch2_data_update_inflight_to_text() Add a new helper for bch2_moving_ctxt_to_text(), which may be used to debug if moving_ios are getting stuck. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 17 ++++++++++++++++- fs/bcachefs/data_update.h | 2 ++ fs/bcachefs/move.c | 5 ++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index e2050256136e..1cfb86823fdf 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -430,6 +430,8 @@ int bch2_data_update_index_update(struct bch_write_op *op) void bch2_data_update_read_done(struct data_update *m, struct bch_extent_crc_unpacked crc) { + m->read_done = true; + /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); @@ -541,7 +543,8 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - printbuf_tabstop_push(out, 20); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); @@ -565,6 +568,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); + prt_newline(out); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) @@ -576,6 +580,17 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); } +void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_printf(out, "read_done:\t\%u\n", m->read_done); + bch2_write_op_to_text(out, &m->op); + printbuf_indent_sub(out, 2); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 
e4b50723428e..7a200e6b770b 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -22,6 +22,7 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, struct data_update { /* extent being updated: */ + bool read_done; enum btree_id btree_id; struct bkey_buf k; struct data_update_opts data_opts; @@ -31,6 +32,7 @@ struct data_update { }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); +void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); int bch2_data_update_index_update(struct bch_write_op *); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 160b4374160a..2e09f980f3f5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -88,13 +88,12 @@ static void move_free(struct moving_io *io) if (io->b) atomic_dec(&io->b->count); - bch2_data_update_exit(&io->write); - mutex_lock(&ctxt->lock); list_del(&io->io_list); wake_up(&ctxt->wait); mutex_unlock(&ctxt->lock); + bch2_data_update_exit(&io->write); kfree(io); } @@ -1216,7 +1215,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str mutex_lock(&ctxt->lock); list_for_each_entry(io, &ctxt->ios, io_list) - bch2_write_op_to_text(out, &io->write.op); + bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); From 9f37016cb24e650de525fce05e3f4521de228d6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jan 2025 10:47:42 -0500 Subject: [PATCH 012/180] bcachefs: kill bch_read_bio.devs_have Dead code. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 2 -- fs/bcachefs/io_read.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index b2b0280d8365..2169c21d94e0 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1070,8 +1070,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->hole = 0; rbio->retry = 0; rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->subvol = orig->subvol; rbio->read_pos = read_pos; diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index a82e8a94ccb6..6d8b5dc55ada 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -47,8 +47,6 @@ struct bch_read_bio { u16 _state; }; - struct bch_devs_list devs_have; - struct extent_ptr_decoded pick; /* From 9157b3ddfb151d990a5053707d1eaac6055a4844 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jan 2025 17:40:39 -0500 Subject: [PATCH 013/180] bcachefs: x-macroize BCH_READ flags Will be adding a bch2_read_bio_to_text(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 8 ++-- fs/bcachefs/io_read.c | 82 ++++++++++++++++++------------------ fs/bcachefs/io_read.h | 35 +++++++++------ fs/bcachefs/move.c | 4 +- 4 files changed, 69 insertions(+), 60 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index d9a360782946..b68472f1d1d9 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -159,8 +159,8 @@ static void bchfs_read(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; + int flags = BCH_READ_retry_if_stale| + BCH_READ_may_promote; int ret = 0; rbio->c = c; @@ -221,14 +221,14 @@ static void bchfs_read(struct btree_trans *trans, swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(rbio->bio.bi_iter.bi_size, bytes); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 2169c21d94e0..304f76fb0f43 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -105,7 +105,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) + if (!(flags & BCH_READ_may_promote)) return -BCH_ERR_nopromote_may_not; if (bch2_bkey_has_target(c, k, opts.promote_target)) @@ -419,8 +419,8 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio struct bkey_s_c k; int ret; - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; + flags &= ~BCH_READ_last_fragment; + flags |= BCH_READ_must_clone; bch2_bkey_buf_init(&sk); @@ -487,14 +487,14 @@ static void 
bch2_rbio_retry(struct work_struct *work) rbio = bch2_rbio_free(rbio); - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; + flags |= BCH_READ_in_retry; + flags &= ~BCH_READ_may_promote; - if (flags & BCH_READ_NODECODE) { + if (flags & BCH_READ_data_update) { bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; + flags &= ~BCH_READ_last_fragment; + flags |= BCH_READ_must_clone; __bch2_read(c, rbio, iter, inum, &failed, flags); } @@ -505,7 +505,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, { rbio->retry = retry; - if (rbio->flags & BCH_READ_IN_RETRY) + if (rbio->flags & BCH_READ_in_retry) return; if (retry == READ_ERR) { @@ -712,7 +712,7 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (rbio->flags & BCH_READ_NODECODE) + if (rbio->flags & BCH_READ_data_update) goto nodecode; /* Adjust crc to point to subset of data we want: */ @@ -759,7 +759,7 @@ static void __bch2_read_endio(struct work_struct *work) rbio->promote = NULL; } nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + if (likely(!(rbio->flags & BCH_READ_in_retry))) { rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } @@ -772,8 +772,8 @@ static void __bch2_read_endio(struct work_struct *work) * reading into buffers owned by userspace (that userspace can * scribble over) - retry the read, bouncing it this time: */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; + if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { + rbio->flags |= BCH_READ_must_bounce; bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); goto out; } @@ -810,11 +810,11 @@ static void bch2_read_endio(struct bio *bio) return; } - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || (ca && 
dev_ptr_stale(ca, &rbio->pick.ptr))) { trace_and_count(c, read_reuse_race, &rbio->bio); - if (rbio->flags & BCH_READ_RETRY_IF_STALE) + if (rbio->flags & BCH_READ_retry_if_stale) bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); else bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); @@ -941,7 +941,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, * retry path, don't check here, it'll be caught in bch2_read_endio() * and we'll end up in the retry path: */ - if ((flags & BCH_READ_IN_RETRY) && + if ((flags & BCH_READ_in_retry) && !pick.ptr.cached && ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { @@ -951,7 +951,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto retry_pick; } - if (flags & BCH_READ_NODECODE) { + if (flags & BCH_READ_data_update) { /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: @@ -966,15 +966,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto get_bio; } - if (!(flags & BCH_READ_LAST_FRAGMENT) || + if (!(flags & BCH_READ_last_fragment) || bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; + flags |= BCH_READ_must_clone; - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + narrow_crcs = !(flags & BCH_READ_in_retry) && bch2_can_narrow_extent_crcs(k, pick.crc); - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; + if (narrow_crcs && (flags & BCH_READ_user_mapped)) + flags |= BCH_READ_must_bounce; EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); @@ -982,8 +982,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, (pick.crc.csum_type != BCH_CSUM_none && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { + (flags & BCH_READ_user_mapped)) || + (flags & BCH_READ_must_bounce)))) { 
read_full = true; bounce = true; } @@ -1034,7 +1034,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { + } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't @@ -1079,7 +1079,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - if (flags & BCH_READ_NODECODE) + if (flags & BCH_READ_data_update) orig->pick = pick; rbio->bio.bi_opf = orig->bio.bi_opf; @@ -1089,7 +1089,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (rbio->bounce) trace_and_count(c, read_bounce, &rbio->bio); - if (!(flags & BCH_READ_NODECODE)) + if (!(flags & BCH_READ_data_update)) this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); @@ -1097,11 +1097,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { bio_inc_remaining(&orig->bio); trace_and_count(c, read_split, &orig->bio); } @@ -1110,7 +1110,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ - if (!(flags & BCH_READ_IN_RETRY)) + if (!(flags & BCH_READ_in_retry)) bch2_trans_unlock(trans); else 
bch2_trans_unlock_long(trans); @@ -1134,10 +1134,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bio_set_dev(&rbio->bio, ca->disk_sb.bdev); if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) submit_bio(&rbio->bio); else submit_bio_wait(&rbio->bio); @@ -1155,11 +1155,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto out; } - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (likely(!(flags & BCH_READ_in_retry))) { return 0; } else { bch2_trans_unlock(trans); @@ -1184,7 +1184,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } err: - if (flags & BCH_READ_IN_RETRY) + if (flags & BCH_READ_in_retry) return READ_ERR; orig->bio.bi_status = BLK_STS_IOERR; @@ -1192,16 +1192,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, hole: /* - * won't normally happen in the BCH_READ_NODECODE + * won't normally happen in the BCH_READ_data_update * (bch2_move_extent()) path, but if we retry and the extent we wanted * to read no longer exists we have to signal that: */ - if (flags & BCH_READ_NODECODE) + if (flags & BCH_READ_data_update) orig->hole = true; zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) bch2_rbio_done(orig); return 0; } @@ -1216,7 +1216,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - BUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, @@ -1266,7 +1266,7 @@ void __bch2_read(struct bch_fs *c, struct 
bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, @@ -1274,7 +1274,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) goto err; - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(bvec_iter.bi_size, bytes); diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 6d8b5dc55ada..ef5603daf122 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -106,17 +106,26 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; } -enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, +#define BCH_READ_FLAGS() \ + x(retry_if_stale) \ + x(may_promote) \ + x(user_mapped) \ + x(data_update) \ + x(last_fragment) \ + x(must_bounce) \ + x(must_clone) \ + x(in_retry) - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, +enum __bch_read_flags { +#define x(n) __BCH_READ_##n, + BCH_READ_FLAGS() +#undef x +}; + +enum bch_read_flags { +#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), + BCH_READ_FLAGS() +#undef x }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, @@ -148,9 +157,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped); } static inline struct bch_read_bio *rbio_init(struct bio *bio, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2e09f980f3f5..70304d7c234a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -358,8 
+358,8 @@ int bch2_move_extent(struct moving_context *ctxt, bch2_read_extent(trans, &io->rbio, bkey_start_pos(k.k), iter->btree_id, k, 0, - BCH_READ_NODECODE| - BCH_READ_LAST_FRAGMENT); + BCH_READ_data_update| + BCH_READ_last_fragment); return 0; err_free_pages: bio_free_pages(&io->write.op.wbio.bio); From 14e2523fc59d1673766fb2a81b1d2f82134debc0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jan 2025 13:18:50 -0500 Subject: [PATCH 014/180] bcachefs: Rename BCH_WRITE flags fer consistency with other x-macros enums The uppercase/lowercase style is nice for making the namespace explicit. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 +-- fs/bcachefs/data_update.c | 8 ++-- fs/bcachefs/fs-io-buffered.c | 2 +- fs/bcachefs/fs-io-direct.c | 4 +- fs/bcachefs/io_read.c | 2 +- fs/bcachefs/io_write.c | 86 +++++++++++++++++----------------- fs/bcachefs/io_write.h | 29 ++++++------ fs/bcachefs/io_write_types.h | 2 +- fs/bcachefs/rebalance.c | 4 +- 9 files changed, 71 insertions(+), 72 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5a781fb4c794..1a539e7bedc8 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -728,7 +728,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_dev_usage usage; struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + cl, flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -1336,7 +1336,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, if (wp->data_type != BCH_DATA_user) have_cache = true; - if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { + if (target && !(flags & BCH_WRITE_only_specified_devs)) { ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, @@ -1426,7 +1426,7 @@ int 
bch2_alloc_sectors_start_trans(struct btree_trans *trans, if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) ret = -BCH_ERR_bucket_alloc_blocked; - if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && + if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) ret = -BCH_ERR_bucket_alloc_blocked; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 1cfb86823fdf..bfd8ba162630 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -681,10 +681,10 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_PAGES_STABLE| - BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_MOVE| + m->op.flags |= BCH_WRITE_pages_stable| + BCH_WRITE_pages_owned| + BCH_WRITE_data_encoded| + BCH_WRITE_move| m->data_opts.write_flags; m->op.compression_opt = io_opts.background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index b68472f1d1d9..dc00edfefc91 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -430,7 +430,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) } } - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { struct bch_folio *s; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 2089c36b5866..d2f2cbaba7a8 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -511,8 +511,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.devs_need_flush = &inode->ei_devs_need_flush; if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + dio->op.flags |= BCH_WRITE_sync; + dio->op.flags |= BCH_WRITE_check_enospc; ret = bch2_quota_reservation_add(c, inode, 
&dio->quota_res, bio_sectors(bio), true); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 304f76fb0f43..539b48f94523 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -232,7 +232,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, if (!have_io_error(failed)) { update_opts.target = opts.promote_target; update_opts.extra_replicas = 1; - update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; + update_opts.write_flags = BCH_WRITE_alloc_nowait|BCH_WRITE_cached; } else { update_opts.target = opts.foreground_target; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index a903f39caa3e..076e39474610 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -374,7 +374,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); + op->flags & BCH_WRITE_check_enospc); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -403,7 +403,7 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, (subvol_inum) { op->subvol, op->pos.inode, }, offset << 9); prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); + op->flags & BCH_WRITE_move ? "(internal move)" : ""); } void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) @@ -418,7 +418,7 @@ static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf (subvol_inum) { op->subvol, op->pos.inode, }, offset << 9); prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); + op->flags & BCH_WRITE_move ? 
"(internal move)" : ""); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -493,7 +493,7 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); - if (!(op->flags & BCH_WRITE_MOVE)) + if (!(op->flags & BCH_WRITE_move)) bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); @@ -539,7 +539,7 @@ static void __bch2_write_index(struct bch_write_op *op) unsigned dev; int ret = 0; - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + if (unlikely(op->flags & BCH_WRITE_io_error)) { ret = bch2_write_drop_io_error_ptrs(op); if (ret) goto err; @@ -548,7 +548,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - ret = !(op->flags & BCH_WRITE_MOVE) + ret = !(op->flags & BCH_WRITE_move) ? bch2_write_index_default(op) : bch2_data_update_index_update(op); @@ -580,7 +580,7 @@ static void __bch2_write_index(struct bch_write_op *op) err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; goto out; } @@ -623,8 +623,8 @@ static CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_SUBMITTED) && - (op->flags & BCH_WRITE_MOVE)) + if ((op->flags & BCH_WRITE_submitted) && + (op->flags & BCH_WRITE_move)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); spin_lock_irqsave(&wp->writes_lock, flags); @@ -662,11 +662,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) if (!op) break; - op->flags |= BCH_WRITE_IN_WORKER; + op->flags |= BCH_WRITE_in_worker; __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -690,7 +690,7 @@ static void bch2_write_endio(struct bio *bio) 
"data write error: %s", bch2_blk_status_to_str(bio->bi_status))) { set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_IO_ERROR; + op->flags |= BCH_WRITE_io_error; } if (wbio->nocow) { @@ -737,7 +737,7 @@ static void init_append_extent(struct bch_write_op *op, bch2_extent_crc_append(&e->k_i, crc); bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_CACHED); + op->flags & BCH_WRITE_cached); bch2_keylist_push(&op->insert_keys); } @@ -854,7 +854,7 @@ static enum prep_encoded_ret { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + if (!(op->flags & BCH_WRITE_data_encoded)) return PREP_ENCODED_OK; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -962,9 +962,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (ec_buf || op->compression_opt || (op->csum_type && - !(op->flags & BCH_WRITE_PAGES_STABLE)) || + !(op->flags & BCH_WRITE_pages_stable)) || (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_PAGES_OWNED))) { + !(op->flags & BCH_WRITE_pages_owned))) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); @@ -984,7 +984,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, break; BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_DATA_ENCODED) && + (op->flags & BCH_WRITE_data_encoded) && bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_opt && !bounce); @@ -1022,7 +1022,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } } - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { @@ -1054,7 +1054,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.compression_type = compression_type; crc.nonce = 
nonce; } else { - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && bch2_rechecksum_bio(c, src, version, op->crc, NULL, &op->crc, src_len >> 9, @@ -1228,9 +1228,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + if (unlikely(op->flags & BCH_WRITE_io_error)) { op->error = -EIO; - } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1259,7 +1259,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct bucket_to_lock *stale_at; int stale, ret; - if (op->flags & BCH_WRITE_MOVE) + if (op->flags & BCH_WRITE_move) return; darray_init(&buckets); @@ -1317,7 +1317,7 @@ static void bch2_nocow_write(struct bch_write_op *op) }), GFP_KERNEL|__GFP_NOFAIL); if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + op->flags |= BCH_WRITE_convert_unwritten; } /* Unlock before taking nocow locks, doing IO: */ @@ -1325,7 +1325,7 @@ static void bch2_nocow_write(struct bch_write_op *op) bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + if (op->flags & BCH_WRITE_convert_unwritten) bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { @@ -1350,7 +1350,7 @@ static void bch2_nocow_write(struct bch_write_op *op) wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } op->pos.offset += bio_sectors(bio); @@ -1364,7 +1364,7 @@ static void bch2_nocow_write(struct bch_write_op *op) op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) break; 
bch2_btree_iter_advance(&iter); } @@ -1384,15 +1384,15 @@ static void bch2_nocow_write(struct bch_write_op *op) bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } /* fallback to cow write path? */ - if (!(op->flags & BCH_WRITE_SUBMITTED)) { + if (!(op->flags & BCH_WRITE_submitted)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_SYNC) { + } else if (op->flags & BCH_WRITE_sync) { closure_sync(&op->cl); bch2_nocow_write_done(&op->cl.work); } else { @@ -1444,7 +1444,7 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) goto out_nofs_restore; } again: @@ -1474,7 +1474,7 @@ static void __bch2_write(struct bch_write_op *op) ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), op->write_point, &op->devs_have, op->nr_replicas, @@ -1497,10 +1497,10 @@ static void __bch2_write(struct bch_write_op *op) bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { + if (!(op->flags & BCH_WRITE_alloc_nowait)) { struct printbuf buf = PRINTBUF; bch2_write_op_error(&buf, op); prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); @@ -1532,14 +1532,14 @@ static void __bch2_write(struct bch_write_op *op) * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. 
*/ - if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_SUBMITTED) && - !(op->flags & BCH_WRITE_IN_WORKER))) { + if ((op->flags & BCH_WRITE_sync) || + (!(op->flags & BCH_WRITE_submitted) && + !(op->flags & BCH_WRITE_in_worker))) { bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) goto again; bch2_write_done(&op->cl); } else { @@ -1560,8 +1560,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) memset(&op->failed, 0, sizeof(op->failed)); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_wrote_data_inline; + op->flags |= BCH_WRITE_submitted; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); @@ -1624,8 +1624,8 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); - if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - op->flags |= BCH_WRITE_ALLOC_NOWAIT; + if (op->flags & BCH_WRITE_only_specified_devs) + op->flags |= BCH_WRITE_alloc_nowait; op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); @@ -1646,13 +1646,13 @@ CLOSURE_CALLBACK(bch2_write) goto err; } - if (!(op->flags & BCH_WRITE_MOVE) && + if (!(op->flags & BCH_WRITE_move) && !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } - if (!(op->flags & BCH_WRITE_MOVE)) + if (!(op->flags & BCH_WRITE_move)) this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b4626013abc8..02cca52be0bd 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -23,21 +23,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); #define BCH_WRITE_FLAGS() \ - 
x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(SUBMITTED) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) enum __bch_write_flags { #define x(f) __BCH_WRITE_##f, diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 6e878a6f2f0b..3ef6df9145ef 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -64,7 +64,7 @@ struct bch_write_op { struct bpos pos; struct bversion version; - /* For BCH_WRITE_DATA_ENCODED: */ + /* For BCH_WRITE_data_encoded: */ struct bch_extent_crc_unpacked crc; struct write_point_specifier write_point; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d0a1f5cd5c2b..58f6d97e506c 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -341,7 +341,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, memset(data_opts, 0, sizeof(*data_opts)); data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { /* @@ -449,7 +449,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, { data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; return data_opts->rewrite_ptrs != 0; } From 0f856b72286810a1bcf148c92f04d84c36bb4f30 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Jan 2025 15:20:04 -0500 Subject: [PATCH 015/180] bcachefs: rbio_init_fragment() Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 18 +++++++----------- fs/bcachefs/io_read.h | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 539b48f94523..3b474c679fb4 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -177,6 +177,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, struct bch_io_failures *failed) { struct bch_fs *c = trans->c; + struct bch_read_bio *orig = *rbio; struct promote_op *op = NULL; struct bio *bio; unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); @@ -206,7 +207,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, goto err; } - rbio_init(&(*rbio)->bio, opts); + rbio_init_fragment(&(*rbio)->bio, orig); bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { @@ -215,7 +216,6 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, } (*rbio)->bounce = true; - (*rbio)->split = true; (*rbio)->kmalloc = true; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, @@ -1024,16 +1024,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } else if (bounce) { unsigned sectors = pick.crc.compressed_size; - rbio = rbio_init(bio_alloc_bioset(NULL, + rbio = rbio_init_fragment(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; - rbio->split = true; } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error @@ -1043,11 +1042,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, * from the whole bio, in which case we don't want 
to retry and * lose the error) */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); rbio->bio.bi_iter = iter; - rbio->split = true; } else { rbio = orig; rbio->bio.bi_iter = iter; @@ -1058,9 +1056,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->c = c; rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else + if (!rbio->split) rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index ef5603daf122..11fdf73a38b1 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -162,6 +162,20 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BCH_READ_user_mapped); } + +static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, + struct bch_read_bio *orig) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->split = true; + rbio->parent = orig; + rbio->promote = NULL; + rbio->opts = orig->opts; + return rbio; +} + static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_io_opts opts) { From dfa204b169ed29e4cdcb5a20d4cba2f579235543 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Jan 2025 12:59:43 -0500 Subject: [PATCH 016/180] bcachefs: rbio_init() cleanup Move more initialization to rbio_init(), to assist in further cleanups. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 13 ++++++------- fs/bcachefs/fs-io-direct.c | 16 ++++++++++++---- fs/bcachefs/io_read.c | 18 ++++++++---------- fs/bcachefs/io_read.h | 17 ++++++++++------- fs/bcachefs/move.c | 8 +++++--- 5 files changed, 41 insertions(+), 31 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index dc00edfefc91..0ec2eebdeffa 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -163,8 +163,6 @@ static void bchfs_read(struct btree_trans *trans, BCH_READ_may_promote; int ret = 0; - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); @@ -290,12 +288,13 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_readpages_end_io); readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(trans, rbio, inode_inum(inode), @@ -333,10 +332,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_read_single_folio_end_io); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index d2f2cbaba7a8..535bc5fcbcc0 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct blk_plug plug; loff_t 
offset = req->ki_pos; bool sync = is_sync_kiocb(req); + bool split = false; size_t shorten; ssize_t ret; @@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_read_bioset); - bio->bi_end_io = bch2_direct_IO_read_endio; - dio = container_of(bio, struct dio_read, rbio.bio); closure_init(&dio->cl, NULL); @@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { + split = true; + bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_READ, GFP_KERNEL, &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; start: bio->bi_opf = REQ_OP_READ|REQ_SYNC; bio->bi_iter.bi_sector = offset >> 9; @@ -160,7 +160,15 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + struct bch_read_bio *rbio = + rbio_init(bio, + c, + opts, + split + ? 
bch2_direct_IO_read_split_endio + : bch2_direct_IO_read_endio); + + bch2_read(c, rbio, inode_inum(inode)); } blk_finish_plug(&plug); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3b474c679fb4..aa6536d8c62e 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -171,13 +171,12 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, struct bkey_s_c k, struct bpos pos, struct extent_ptr_decoded *pick, - struct bch_io_opts opts, unsigned sectors, + struct bch_read_bio *orig, struct bch_read_bio **rbio, struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - struct bch_read_bio *orig = *rbio; struct promote_op *op = NULL; struct bio *bio; unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); @@ -230,11 +229,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, struct data_update_opts update_opts = {}; if (!have_io_error(failed)) { - update_opts.target = opts.promote_target; + update_opts.target = orig->opts.promote_target; update_opts.extra_replicas = 1; update_opts.write_flags = BCH_WRITE_alloc_nowait|BCH_WRITE_cached; } else { - update_opts.target = opts.foreground_target; + update_opts.target = orig->opts.foreground_target; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned ptr_bit = 1; @@ -247,7 +246,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), - opts, + orig->opts, update_opts, btree_id, k); /* @@ -279,8 +278,8 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, - struct bch_io_opts opts, unsigned flags, + struct bch_read_bio *orig, struct bch_read_bio **rbio, bool *bounce, bool *read_full, @@ -304,7 +303,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags, 
failed); + ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote; @@ -312,7 +311,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio, failed); + k, pos, pick, sectors, orig, rbio, failed); ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -989,7 +988,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } if (orig->opts.promote_target || have_io_error(failed)) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, + promote = promote_alloc(trans, iter, k, &pick, flags, orig, &rbio, &bounce, &read_full, failed); if (!read_full) { @@ -1054,7 +1053,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - rbio->c = c; rbio->submit_time = local_clock(); if (!rbio->split) rbio->end_io = orig->bio.bi_end_io; diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 11fdf73a38b1..5be4f4b35568 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -152,8 +152,6 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(rbio->_state); - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, @@ -162,12 +160,12 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BCH_READ_user_mapped); } - static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, struct bch_read_bio *orig) { struct bch_read_bio *rbio = to_rbio(bio); + rbio->c = orig->c; rbio->_state = 0; rbio->split = true; rbio->parent = orig; @@ -177,13 +175,18 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, } static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) + struct bch_fs *c, + struct bch_io_opts opts, + 
bio_end_io_t end_io) { struct bch_read_bio *rbio = to_rbio(bio); - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; + rbio->start_time = local_clock(); + rbio->c = c; + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + rbio->bio.bi_end_io = end_io; return rbio; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 70304d7c234a..b8ac6dd28471 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -307,8 +307,6 @@ int bch2_move_extent(struct moving_context *ctxt, GFP_KERNEL)) goto err_free; - io->rbio.c = c; - io->rbio.opts = io_opts; bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); io->rbio.bio.bi_vcnt = pages; io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -316,7 +314,11 @@ int bch2_move_extent(struct moving_context *ctxt, io->rbio.bio.bi_opf = REQ_OP_READ; io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - io->rbio.bio.bi_end_io = move_read_endio; + + rbio_init(&io->rbio.bio, + c, + io_opts, + move_read_endio); ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, io_opts, data_opts, iter->btree_id, k); From a70bd976303296940ebc7611660ed0dedfeb1fd7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Jan 2025 18:53:55 -0500 Subject: [PATCH 017/180] bcachefs: data_update now embeds bch_read_bio Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.h | 5 +++++ fs/bcachefs/move.c | 28 ++++++++++++---------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 7a200e6b770b..d1dd4819a8a3 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -4,6 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" +#include "io_read.h" #include "io_write_types.h" struct moving_context; @@ -28,7 +29,11 @@ struct data_update { struct data_update_opts data_opts; struct moving_context *ctxt; struct bch_move_stats *stats; + + struct bch_read_bio rbio; struct bch_write_op op; + 
/* Must be last since it is variable size */ + struct bio_vec bi_inline_vecs[]; }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index b8ac6dd28471..6ff1459e3e2e 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -74,11 +74,7 @@ struct moving_io { unsigned read_sectors; unsigned write_sectors; - struct bch_read_bio rbio; - struct data_update write; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -113,7 +109,7 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { - if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + if (unlikely(io->write.rbio.bio.bi_status || io->write.rbio.hole)) { move_free(io); return; } @@ -131,7 +127,7 @@ static void move_write(struct moving_io *io) atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); - bch2_data_update_read_done(&io->write, io->rbio.pick.crc); + bch2_data_update_read_done(&io->write, io->write.rbio.pick.crc); } struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) @@ -144,7 +140,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx static void move_read_endio(struct bio *bio) { - struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); @@ -299,7 +295,7 @@ int bch2_move_extent(struct moving_context *ctxt, io->read_sectors = k.k->size; io->write_sectors = k.k->size; - bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); + bio_init(&io->write.op.wbio.bio, NULL, io->write.bi_inline_vecs, pages, 0); io->write.op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -307,15 +303,15 @@ 
int bch2_move_extent(struct moving_context *ctxt, GFP_KERNEL)) goto err_free; - bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->rbio.bio.bi_vcnt = pages; - io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - io->rbio.bio.bi_iter.bi_size = sectors << 9; + bio_init(&io->write.rbio.bio, NULL, io->write.bi_inline_vecs, pages, 0); + io->write.rbio.bio.bi_vcnt = pages; + io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + io->write.rbio.bio.bi_iter.bi_size = sectors << 9; - io->rbio.bio.bi_opf = REQ_OP_READ; - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + io->write.rbio.bio.bi_opf = REQ_OP_READ; + io->write.rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - rbio_init(&io->rbio.bio, + rbio_init(&io->write.rbio.bio, c, io_opts, move_read_endio); @@ -357,7 +353,7 @@ int bch2_move_extent(struct moving_context *ctxt, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->rbio, + bch2_read_extent(trans, &io->write.rbio, bkey_start_pos(k.k), iter->btree_id, k, 0, BCH_READ_data_update| From 8f97793d67a2bd75345c8e2ac1664950186bf2fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Jan 2025 22:22:29 -0500 Subject: [PATCH 018/180] bcachefs: promote_op uses embedded bch_read_bio Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 106 ++++++++++++++++++------------------------ fs/bcachefs/io_read.h | 8 +--- 2 files changed, 48 insertions(+), 66 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index aa6536d8c62e..5fcb1947db6e 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -166,15 +166,14 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_data_update_read_done(&op->write, rbio->pick.crc); } -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - unsigned sectors, - struct 
bch_read_bio *orig, - struct bch_read_bio **rbio, - struct bch_io_failures *failed) +static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + unsigned sectors, + struct bch_read_bio *orig, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; struct promote_op *op = NULL; @@ -188,34 +187,25 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; - goto err; + goto err_put; } op->start_time = local_clock(); op->pos = pos; - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_KERNEL); - if (!*rbio) { + rbio_init_fragment(&op->write.rbio.bio, orig); + bio_init(&op->write.rbio.bio, + NULL, + op->write.bi_inline_vecs, + pages, 0); + + if (bch2_bio_alloc_pages(&op->write.rbio.bio, sectors << 9, GFP_KERNEL)) { ret = -BCH_ERR_nopromote_enomem; goto err; } - rbio_init_fragment(&(*rbio)->bio, orig); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { - ret = -BCH_ERR_nopromote_enomem; - goto err; - } - - (*rbio)->bounce = true; - (*rbio)->kmalloc = true; + op->write.rbio.bounce = true; + op->write.rbio.promote = true; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { @@ -260,27 +250,23 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, } op->write.op.end_io = promote_done; - - return op; + return &op->write.rbio; err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; + bio_free_pages(&op->write.rbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ 
kfree_rcu(op, rcu); +err_put: bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, +static struct bch_read_bio *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, unsigned flags, struct bch_read_bio *orig, - struct bch_read_bio **rbio, bool *bounce, bool *read_full, struct bch_io_failures *failed) @@ -300,18 +286,18 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct bpos pos = promote_full ? bkey_start_pos(k.k) : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; int ret; ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote; - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, sectors, orig, rbio, failed); + struct bch_read_bio *promote = + __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? 
BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, sectors, orig, failed); ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -374,20 +360,24 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); - if (rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - if (rbio->split) { struct bch_read_bio *parent = rbio->parent; - if (rbio->kmalloc) - kfree(rbio); - else + if (rbio->promote) { + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + + if (!rbio->bio.bi_status) { + promote_start(op, rbio); + } else { + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + promote_free(rbio->c, op); + } + } else { + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + bio_put(&rbio->bio); + } rbio = parent; } @@ -482,7 +472,8 @@ static void bch2_rbio_retry(struct work_struct *work) if (rbio->retry == READ_RETRY_AVOID) bch2_mark_io_failure(&failed, &rbio->pick); - rbio->bio.bi_status = 0; + if (!rbio->split) + rbio->bio.bi_status = 0; rbio = bch2_rbio_free(rbio); @@ -753,9 +744,6 @@ static void __bch2_read_endio(struct work_struct *work) ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; } nodecode: if (likely(!(rbio->flags & BCH_READ_in_retry))) { @@ -887,7 +875,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); int pick_ret; @@ -988,8 +975,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } if (orig->opts.promote_target || have_io_error(failed)) - promote = promote_alloc(trans, iter, 
k, &pick, flags, orig, - &rbio, &bounce, &read_full, failed); + rbio = promote_alloc(trans, iter, k, &pick, flags, orig, + &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); @@ -1070,7 +1057,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->data_btree = data_btree; rbio->data_pos = data_pos; rbio->version = k.k->bversion; - rbio->promote = promote; INIT_WORK(&rbio->work, NULL); if (flags & BCH_READ_data_update) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 5be4f4b35568..f54c9943e34a 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -35,9 +35,9 @@ struct bch_read_bio { u16 flags; union { struct { - u16 bounce:1, + u16 promote:1, + bounce:1, split:1, - kmalloc:1, have_ioref:1, narrow_crcs:1, hole:1, @@ -63,8 +63,6 @@ struct bch_read_bio { struct bpos data_pos; struct bversion version; - struct promote_op *promote; - struct bch_io_opts opts; struct work_struct work; @@ -169,7 +167,6 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, rbio->_state = 0; rbio->split = true; rbio->parent = orig; - rbio->promote = NULL; rbio->opts = orig->opts; return rbio; } @@ -184,7 +181,6 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; - rbio->promote = NULL; rbio->opts = opts; rbio->bio.bi_end_io = end_io; return rbio; From 536d789781c66e3e3ae447b8116952d5ce689e6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jan 2025 14:26:30 -0500 Subject: [PATCH 019/180] bcachefs: bch2_update_unwritten_extent() no longer depends on wbio Prep work for improving bch2_data_update_init(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 26 +++++++++++++++----------- fs/bcachefs/errcode.h | 4 ++++ fs/bcachefs/move.c | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index bfd8ba162630..f5b7e6050f40 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -456,23 +456,23 @@ void bch2_data_update_exit(struct data_update *update) bch2_bio_free_pages_pool(c, &update->op.wbio.bio); } -static void bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; - struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; struct closure cl; struct btree_iter iter; struct bkey_s_c k; - int ret; + int ret = 0; closure_init_stack(&cl); bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - while (bio_sectors(bio)) { - unsigned sectors = bio_sectors(bio); + while (bpos_lt(update->op.pos, update->k.k->k.p)) { + unsigned sectors = update->k.k->k.p.offset - + update->op.pos.offset; bch2_trans_begin(trans); @@ -508,7 +508,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch_err_fn_ratelimited(c, ret); if (ret) - return; + break; sectors = min(sectors, wp->sectors_free); @@ -518,7 +518,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); bch2_alloc_sectors_done(c, wp); - bio_advance(bio, sectors << 9); update->op.pos.offset += sectors; extent_for_each_ptr(extent_i_to_s(e), ptr) @@ -537,6 +536,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_unlock(trans); closure_sync(&cl); } + + return ret; } void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, @@ -657,10 +658,10 @@ int bch2_data_update_init(struct 
btree_trans *trans, * snapshots table - just skip it, we can move it later. */ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done; + return -BCH_ERR_data_update_done_no_snapshot; if (!bkey_get_dev_refs(c, k)) - return -BCH_ERR_data_update_done; + return -BCH_ERR_data_update_done_no_dev_refs; if (c->opts.nocow_enabled && !bkey_nocow_lock(c, ctxt, k)) { @@ -758,6 +759,8 @@ int bch2_data_update_init(struct btree_trans *trans, /* if iter == NULL, it's just a promote */ if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); + if (!ret) + ret = -BCH_ERR_data_update_done_no_writes_needed; goto out; } @@ -771,7 +774,8 @@ int bch2_data_update_init(struct btree_trans *trans, } if (bkey_extent_is_unwritten(k)) { - bch2_update_unwritten_extent(trans, m); + ret = bch2_update_unwritten_extent(trans, m) ?: + -BCH_ERR_data_update_done_unwritten; goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 712877036612..82f950ea1c26 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -181,6 +181,10 @@ x(EINVAL, not_in_recovery) \ x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ + x(BCH_ERR_data_update_done, data_update_done_unwritten) \ + x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ + x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ + x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6ff1459e3e2e..03f071827c5c 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -364,7 +364,7 @@ int bch2_move_extent(struct moving_context *ctxt, err_free: kfree(io); err: - if (ret == -BCH_ERR_data_update_done) + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) return 0; if (bch2_err_matches(ret, EROFS) || From 6f7111f820d5b956bab7ff744ecae953e2bf8e4f Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Jan 2025 00:40:43 -0500 Subject: [PATCH 020/180] bcachefs: cleanup redundant code around data_update_op initialization Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 35 +++++++++++++-- fs/bcachefs/data_update.h | 6 +-- fs/bcachefs/io_read.c | 92 ++++++++++++++------------------------- fs/bcachefs/move.c | 45 +++---------------- 4 files changed, 73 insertions(+), 105 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index f5b7e6050f40..5873abf0a0b2 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,8 @@ #include "subvolume.h" #include "trace.h" +#include <linux/ioprio.h> + static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -427,16 +429,15 @@ int bch2_data_update_index_update(struct bch_write_op *op) return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); } -void bch2_data_update_read_done(struct data_update *m, - struct bch_extent_crc_unpacked crc) +void bch2_data_update_read_done(struct data_update *m) { m->read_done = true; /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); - m->op.crc = crc; - m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + m->op.crc = m->rbio.pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; this_cpu_add(m->op.c->counters[BCH_COUNTER_move_extent_write], m->k.k->k.size); @@ -454,6 +455,8 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + kfree(update->bvecs); + update->bvecs = NULL; } static int bch2_update_unwritten_extent(struct btree_trans *trans, @@ -779,7 +782,31 @@ int bch2_data_update_init(struct btree_trans *trans, goto out; } + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +
buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); + + m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); + if (!m->bvecs) + goto enomem; + + bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); + bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); + + if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) + goto enomem; + + rbio_init(&m->rbio.bio, c, io_opts, NULL); + m->rbio.bio.bi_iter.bi_size = buf_bytes; + m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + return 0; +enomem: + ret = -ENOMEM; out: bch2_data_update_exit(m); return ret ?: -BCH_ERR_data_update_done; diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index d1dd4819a8a3..f4cf5d17cc37 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -32,8 +32,7 @@ struct data_update { struct bch_read_bio rbio; struct bch_write_op op; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[]; + struct bio_vec *bvecs; }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); @@ -41,8 +40,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); int bch2_data_update_index_update(struct bch_write_op *); -void bch2_data_update_read_done(struct data_update *, - struct bch_extent_crc_unpacked); +void bch2_data_update_read_done(struct data_update *); int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 5fcb1947db6e..b00d43c4fdcf 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -125,45 +125,37 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, return 0; } -static void promote_free(struct bch_fs *c, struct promote_op *op) +static noinline void promote_free(struct bch_read_bio *rbio) { - int ret; + struct 
promote_op *op = container_of(rbio, struct promote_op, write.rbio); + struct bch_fs *c = rbio->c; + + int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); bch2_data_update_exit(&op->write); - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } static void promote_done(struct bch_write_op *wop) { - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; + struct promote_op *op = container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.rbio.c; - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); + promote_free(&op->write.rbio); } -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +static noinline void promote_start(struct bch_read_bio *rbio) { - struct bio *bio = &op->write.op.wbio.bio; + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); trace_and_count(op->write.op.c, read_promote, &rbio->bio); - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - - bch2_data_update_read_done(&op->write, rbio->pick.crc); + bch2_data_update_read_done(&op->write); } static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, @@ -176,15 +168,12 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); 
- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); + struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; goto err_put; @@ -193,29 +182,12 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, op->start_time = local_clock(); op->pos = pos; - rbio_init_fragment(&op->write.rbio.bio, orig); - bio_init(&op->write.rbio.bio, - NULL, - op->write.bi_inline_vecs, - pages, 0); - - if (bch2_bio_alloc_pages(&op->write.rbio.bio, sectors << 9, GFP_KERNEL)) { - ret = -BCH_ERR_nopromote_enomem; - goto err; - } - - op->write.rbio.bounce = true; - op->write.rbio.promote = true; - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { ret = -BCH_ERR_nopromote_in_flight; goto err; } - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - struct data_update_opts update_opts = {}; if (!have_io_error(failed)) { @@ -243,16 +215,20 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, * possible errors: -BCH_ERR_nocow_lock_blocked, * -BCH_ERR_ENOSPC_disk_reservation: */ - if (ret) { - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); - goto err; - } + if (ret) + goto err_remove_hash; + rbio_init_fragment(&op->write.rbio.bio, orig); + op->write.rbio.bounce = true; + op->write.rbio.promote = true; op->write.op.end_io = promote_done; + return &op->write.rbio; +err_remove_hash: + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); err: - bio_free_pages(&op->write.rbio.bio); + bio_free_pages(&op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); err_put: @@ -363,15 +339,11 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) if (rbio->split) { struct bch_read_bio *parent = rbio->parent; - if (rbio->promote) { - struct promote_op *op = container_of(rbio, struct promote_op, 
write.rbio); - - if (!rbio->bio.bi_status) { - promote_start(op, rbio); - } else { - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - promote_free(rbio->c, op); - } + if (unlikely(rbio->promote)) { + if (!rbio->bio.bi_status) + promote_start(rbio); + else + promote_free(rbio); } else { if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); @@ -938,11 +910,13 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } if (flags & BCH_READ_data_update) { + struct data_update *u = container_of(orig, struct data_update, rbio); + /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); goto hole; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 03f071827c5c..d825493cac25 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -127,7 +127,7 @@ static void move_write(struct moving_io *io) atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); - bch2_data_update_read_done(&io->write, io->write.rbio.pick.crc); + bch2_data_update_read_done(&io->write); } struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) @@ -253,11 +253,6 @@ int bch2_move_extent(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct moving_io *io; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned sectors = k.k->size, pages; int ret = -ENOMEM; trace_move_extent2(c, k, &io_opts, &data_opts); @@ -280,13 +275,7 @@ int bch2_move_extent(struct moving_context *ctxt, */ bch2_trans_unlock(trans); - /* write path might have to decompress data: */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - sectors = max_t(unsigned, sectors, 
p.crc.uncompressed_size); - - pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - io = kzalloc(sizeof(struct moving_io) + - sizeof(struct bio_vec) * pages, GFP_KERNEL); + struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); if (!io) goto err; @@ -295,31 +284,13 @@ int bch2_move_extent(struct moving_context *ctxt, io->read_sectors = k.k->size; io->write_sectors = k.k->size; - bio_init(&io->write.op.wbio.bio, NULL, io->write.bi_inline_vecs, pages, 0); - io->write.op.wbio.bio.bi_ioprio = - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, - GFP_KERNEL)) - goto err_free; - - bio_init(&io->write.rbio.bio, NULL, io->write.bi_inline_vecs, pages, 0); - io->write.rbio.bio.bi_vcnt = pages; - io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - io->write.rbio.bio.bi_iter.bi_size = sectors << 9; - - io->write.rbio.bio.bi_opf = REQ_OP_READ; - io->write.rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - - rbio_init(&io->write.rbio.bio, - c, - io_opts, - move_read_endio); - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, io_opts, data_opts, iter->btree_id, k); if (ret) - goto err_free_pages; + goto err_free; + + io->write.rbio.bio.bi_end_io = move_read_endio; + io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); io->write.op.end_io = move_write_done; @@ -359,8 +330,6 @@ int bch2_move_extent(struct moving_context *ctxt, BCH_READ_data_update| BCH_READ_last_fragment); return 0; -err_free_pages: - bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: @@ -624,7 +593,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; - if (ret2 == -ENOMEM) { + if (bch2_err_matches(ret2, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); continue; From d0148e7169d5a62e5b03c2ecc3a91e508cb7d9ba Mon Sep 17 00:00:00 2001 From: 
Kent Overstreet Date: Thu, 16 Jan 2025 03:43:03 -0500 Subject: [PATCH 021/180] bcachefs: Be stricter in bch2_read_retry_nodecode() Now that data_update embeds bch_read_bio, BCH_READ_NODECODE means that the read is embedded in a data_update - and we can check in the retry path if the extent has changed and bail out. This likely fixes some subtle bugs with read errors and data moves. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 64 ++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index b00d43c4fdcf..829a4f1b2f03 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -369,61 +369,47 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, +static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { + struct data_update *u = container_of(rbio, struct data_update, rbio); struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_last_fragment; - flags |= BCH_READ_must_clone; - - bch2_bkey_buf_init(&sk); - - bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_slots); retry: bch2_trans_begin(trans); - rbio->bio.bi_status = 0; - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + u->btree_id, bkey_start_pos(&u->k.k->k), + 0))); if (ret) goto err; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { + if (!bkey_and_val_eq(k,
bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ rbio->hole = true; - goto out; + goto err; } ret = __bch2_read_extent(trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); + bkey_start_pos(&u->k.k->k), + u->btree_id, + bkey_i_to_s_c(u->k.k), + 0, failed, flags); +err: + bch2_trans_iter_exit(trans, &iter); + if (ret == READ_RETRY) goto retry; if (ret) - goto err; -out: + rbio->bio.bi_status = BLK_STS_IOERR; + + BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); bch2_rbio_done(rbio); - bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - return; -err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; } static void bch2_rbio_retry(struct work_struct *work) @@ -451,15 +437,13 @@ static void bch2_rbio_retry(struct work_struct *work) flags |= BCH_READ_in_retry; flags &= ~BCH_READ_may_promote; + flags &= ~BCH_READ_last_fragment; + flags |= BCH_READ_must_clone; - if (flags & BCH_READ_data_update) { + if (flags & BCH_READ_data_update) bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - } else { - flags &= ~BCH_READ_last_fragment; - flags |= BCH_READ_must_clone; - + else __bch2_read(c, rbio, iter, inum, &failed, flags); - } } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, From 8ff92a9e4e49fc1fa01e8d23462097ccbe90e3b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jan 2025 13:11:24 -0500 Subject: [PATCH 022/180] bcachefs: Promotes should use BCH_WRITE_only_specified_devs Promotes, like most other internal moves, should only go to the specified target and not fall back to allocating from the full filesystem. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 829a4f1b2f03..7a66feb1c011 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -194,6 +194,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, update_opts.target = orig->opts.promote_target; update_opts.extra_replicas = 1; update_opts.write_flags = BCH_WRITE_alloc_nowait|BCH_WRITE_cached; + update_opts.write_flags |= BCH_WRITE_only_specified_devs; } else { update_opts.target = orig->opts.foreground_target; From 29ad31c780d1e4e37244140442aaec41c3efb7d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Jan 2025 02:05:57 -0500 Subject: [PATCH 023/180] bcachefs: Self healing writes are BCH_WRITE_alloc_nowait If a drive is failing and we're moving data off of it, we can't necessarily depend on capacity/disk reservation calculations to avoid deadlocking/blocking on the allocator. And, we don't want to queue up infinite self healing moves anyways.
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 7a66feb1c011..bdb554f6db8c 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -188,12 +188,12 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, goto err; } - struct data_update_opts update_opts = {}; + struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; if (!have_io_error(failed)) { update_opts.target = orig->opts.promote_target; update_opts.extra_replicas = 1; - update_opts.write_flags = BCH_WRITE_alloc_nowait|BCH_WRITE_cached; + update_opts.write_flags |= BCH_WRITE_cached; update_opts.write_flags |= BCH_WRITE_only_specified_devs; } else { update_opts.target = orig->opts.foreground_target; From c37d42a0e2be210e4e9b60a5a1092e1f139b64a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jan 2025 13:43:44 -0500 Subject: [PATCH 024/180] bcachefs: Rework init order in bch2_data_update_init() Initialize the write op first, so that in the next patch we can check if the allocator would block (for BCH_WRITE_alloc_nowait ops) and bail out before taking nocow locks/dev refs. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 54 ++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 5873abf0a0b2..3e8ad94dca59 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -35,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { + if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { bkey_for_each_ptr(ptrs, ptr2) { if (ptr2 == ptr) break; @@ -449,14 +449,15 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_s_c k = bkey_i_to_s_c(update->k.k); - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); - bkey_put_dev_refs(c, k); - bch2_bkey_buf_exit(&update->k, c); - bch2_disk_reservation_put(c, &update->op.res); bch2_bio_free_pages_pool(c, &update->op.wbio.bio); kfree(update->bvecs); update->bvecs = NULL; + + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); + bch2_disk_reservation_put(c, &update->op.res); + bch2_bkey_buf_exit(&update->k, c); } static int bch2_update_unwritten_extent(struct btree_trans *trans, @@ -663,15 +664,6 @@ int bch2_data_update_init(struct btree_trans *trans, if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done_no_snapshot; - if (!bkey_get_dev_refs(c, k)) - return -BCH_ERR_data_update_done_no_dev_refs; - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - bkey_put_dev_refs(c, k); - return -BCH_ERR_nocow_lock_blocked; - } - bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; @@ -764,7 +756,7 @@ int bch2_data_update_init(struct btree_trans *trans, ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); if (!ret) ret = 
-BCH_ERR_data_update_done_no_writes_needed; - goto out; + goto out_bkey_buf_exit; } if (reserve_sectors) { @@ -773,13 +765,24 @@ int bch2_data_update_init(struct btree_trans *trans, ? 0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto out; + goto out_bkey_buf_exit; + } + + if (!bkey_get_dev_refs(c, k)) { + ret = -BCH_ERR_data_update_done_no_dev_refs; + goto out_put_disk_res; + } + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto out_put_dev_refs; } if (bkey_extent_is_unwritten(k)) { ret = bch2_update_unwritten_extent(trans, m) ?: -BCH_ERR_data_update_done_unwritten; - goto out; + goto out_nocow_unlock; } /* write path might have to decompress data: */ @@ -807,9 +810,18 @@ int bch2_data_update_init(struct btree_trans *trans, return 0; enomem: ret = -ENOMEM; -out: - bch2_data_update_exit(m); - return ret ?: -BCH_ERR_data_update_done; + kfree(m->bvecs); + m->bvecs = NULL; +out_nocow_unlock: + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); +out_put_dev_refs: + bkey_put_dev_refs(c, k); +out_put_disk_res: + bch2_disk_reservation_put(c, &m->op.res); +out_bkey_buf_exit: + bch2_bkey_buf_exit(&m->k, c); + return ret; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) From 7e9ed60f5fe58dd4b4b6dcf63e57154c6262a2af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jan 2025 13:55:33 -0500 Subject: [PATCH 025/180] bcachefs: Bail out early on alloc_nowait data updates If a data update doesn't want to block on allocations (promotes, self healing on read error) - check if the allocation would fail before kicking off the data update and calling into the write path. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 19 +--------------- fs/bcachefs/alloc_foreground.h | 17 +++++++++++++++ fs/bcachefs/data_update.c | 40 ++++++++++++++++++++++++++++++++++ fs/bcachefs/errcode.h | 1 + 4 files changed, 59 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1a539e7bedc8..1759c15a7745 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -static inline unsigned open_buckets_reserved(enum bch_watermark watermark) -{ - switch (watermark) { - case BCH_WATERMARK_interior_updates: - return 0; - case BCH_WATERMARK_reclaim: - return OPEN_BUCKETS_COUNT / 6; - case BCH_WATERMARK_btree: - case BCH_WATERMARK_btree_copygc: - return OPEN_BUCKETS_COUNT / 4; - case BCH_WATERMARK_copygc: - return OPEN_BUCKETS_COUNT / 3; - default: - return OPEN_BUCKETS_COUNT / 2; - } -} - static inline bool may_alloc_bucket(struct bch_fs *c, struct bpos bucket, struct bucket_alloc_state *s) @@ -239,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index f25481a0d1a0..baf5dc163c8a 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) return bch2_dev_have_ref(c, ob->dev); } +static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) +{ + switch (watermark) { + case BCH_WATERMARK_interior_updates: + return 0; + case 
BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: + return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum bch_watermark, enum bch_data_type, struct closure *); diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 3e8ad94dca59..ec63dd494c80 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -639,6 +639,40 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } +static bool can_allocate_without_blocking(struct bch_fs *c, + struct data_update *m) +{ + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) + return false; + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs + ? m->op.target + : 0; + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); + + darray_for_each(m->op.devs_have, i) + __clear_bit(*i, devs.d); + + rcu_read_lock(); + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu(c, i); + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + + if (!dev_buckets_free(ca, usage, m->op.watermark)) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } + rcu_read_unlock(); + + return nr_replicas >= m->op.nr_replicas; +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, @@ -759,6 +793,12 @@ int bch2_data_update_init(struct btree_trans *trans, goto out_bkey_buf_exit; } + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + !can_allocate_without_blocking(c, m)) { + ret = -BCH_ERR_data_update_done_would_block; + goto out_bkey_buf_exit; + } + if (reserve_sectors) { ret = 
bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 82f950ea1c26..1e8f65f95d60 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -181,6 +181,7 @@ x(EINVAL, not_in_recovery) \ x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ + x(BCH_ERR_data_update_done, data_update_done_would_block) \ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ From 4dfb76e0ad22d959ecb477f1e982500047ce38a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Jan 2025 19:26:10 -0500 Subject: [PATCH 026/180] bcachefs: Don't start promotes from bch2_rbio_free() we don't want to block completion of the read - starting a promote calls into the write path, which will block. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index bdb554f6db8c..15494aba4547 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -80,6 +80,7 @@ struct promote_op { struct rhash_head hash; struct bpos pos; + struct work_struct work; struct data_update write; struct bio_vec bi_inline_vecs[]; /* must be last */ }; @@ -149,13 +150,21 @@ static void promote_done(struct bch_write_op *wop) promote_free(&op->write.rbio); } +static void promote_start_work(struct work_struct *work) +{ + struct promote_op *op = container_of(work, struct promote_op, work); + + bch2_data_update_read_done(&op->write); +} + static noinline void promote_start(struct bch_read_bio *rbio) { struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); trace_and_count(op->write.op.c, read_promote, &rbio->bio); - bch2_data_update_read_done(&op->write); + INIT_WORK(&op->work, promote_start_work); + queue_work(rbio->c->write_ref_wq, &op->work); 
} static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, From 7b1d6551060066a1fed2a1f83485b0ea37ca3001 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 19 Jan 2025 20:34:57 -0500 Subject: [PATCH 027/180] bcachefs: Don't self-heal if a data update is already rewriting Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 68 ++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 15494aba4547..bb5d1de25aa1 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -97,6 +97,26 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } +static bool ptr_being_rewritten(struct bch_read_bio *orig, + unsigned dev, + unsigned flags) +{ + if (!(flags & BCH_READ_data_update)) + return false; + + struct data_update *u = container_of(orig, struct data_update, rbio); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev && + u->data_opts.rewrite_ptrs & BIT(i)) + return true; + i++; + } + + return false; +} + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, @@ -173,12 +193,36 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, struct bpos pos, struct extent_ptr_decoded *pick, unsigned sectors, + unsigned flags, struct bch_read_bio *orig, struct bch_io_failures *failed) { struct bch_fs *c = trans->c; int ret; + struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; + + if (!have_io_error(failed)) { + update_opts.target = orig->opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags |= BCH_WRITE_cached; + update_opts.write_flags |= BCH_WRITE_only_specified_devs; + } else { + update_opts.target = orig->opts.foreground_target; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + 
unsigned ptr_bit = 1; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev) && + !ptr_being_rewritten(orig, ptr->dev, flags)) + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + if (!update_opts.rewrite_ptrs) + return NULL; + } + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); @@ -197,25 +241,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, goto err; } - struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - - if (!have_io_error(failed)) { - update_opts.target = orig->opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags |= BCH_WRITE_cached; - update_opts.write_flags |= BCH_WRITE_only_specified_devs; - } else { - update_opts.target = orig->opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), orig->opts, @@ -283,7 +308,10 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, sectors, orig, failed); + k, pos, pick, sectors, flags, orig, failed); + if (!promote) + return NULL; + ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; From dff6de9518848b5afa0bc6fec57e657701be67ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Dec 2024 16:32:57 -0500 Subject: [PATCH 028/180] bcachefs: Internal reads can now correct errors Rework the read path so that BCH_READ_NODECODE reads now also self-heal after a read error and a successful retry - prerequisite for scrub. - __bch2_read_endio() now handles a read that's both BCH_READ_NODECODE and a bounce. 
Normally, we don't want a BCH_READ_NODECODE read to ever allocate a split bch_read_bio: we want to maintain the relationship between the bch_read_bio and the data_update it's embedded in. But correcting read errors requires allocating a split/bounce rbio that's embedded in a promote_op. We do still have a 1-1 relationship, i.e. we only allocate a single split/bounce if it's a BCH_READ_NODECODE, so things hopefully don't get too crazy. - __bch2_read_extent() now is allowed to allocate the promote_op for rewriting after a failed read, even if it's BCH_READ_NODECODE. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 108 ++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index bb5d1de25aa1..18c8e54f455e 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -696,32 +696,40 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (rbio->flags & BCH_READ_data_update) - goto nodecode; + if (likely(!(rbio->flags & BCH_READ_data_update))) { + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - 
!c->opts.no_data_io) - goto decompression_err; + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (rbio->split) + rbio->parent->pick = rbio->pick; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -739,7 +747,7 @@ static void __bch2_read_endio(struct work_struct *work) if (ret) goto decrypt_err; } -nodecode: + if (likely(!(rbio->flags & BCH_READ_in_retry))) { rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); @@ -931,13 +939,35 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto retry_pick; } - if (flags & BCH_READ_data_update) { - struct data_update *u = container_of(orig, struct data_update, rbio); + if (!(flags & BCH_READ_data_update)) { + if (!(flags & BCH_READ_last_fragment) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_must_clone; + narrow_crcs = !(flags & BCH_READ_in_retry) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_user_mapped)) + flags |= BCH_READ_must_bounce; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_user_mapped)) || + (flags & BCH_READ_must_bounce)))) { + read_full = true; + bounce = true; + } + } else { + read_full = true; /* * 
can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ + struct data_update *u = container_of(orig, struct data_update, rbio); if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); @@ -945,29 +975,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_last_fragment) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_must_clone; - - narrow_crcs = !(flags & BCH_READ_in_retry) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_user_mapped)) - flags |= BCH_READ_must_bounce; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_user_mapped)) || - (flags & BCH_READ_must_bounce)))) { - read_full = true; - bounce = true; } if (orig->opts.promote_target || have_io_error(failed)) @@ -991,7 +998,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); } -get_bio: + if (rbio) { /* * promote already allocated bounce rbio: @@ -1055,9 +1062,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->version = k.k->bversion; INIT_WORK(&rbio->work, NULL); - if (flags & BCH_READ_data_update) - orig->pick = pick; - rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; From ca16fa6b860fe35ba97dc28bb1792b02767c01de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2024 16:20:38 -0500 Subject: [PATCH 029/180] bcachefs: backpointer_get_key() doesn't pull in btree node We may not need to pull in a btree 
node when walking backpointers - don't do so unnecessarily when using backpointer_get_key(). It'll still fall back to backpointer_get_node() in a few situations, including btree roots (where an iterator can't point at just the key), and races due to the interior update path not having deleted a backpointer to an old node yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 1d30066e63dc..3aff2b24de4a 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -244,27 +244,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) return bkey_s_c_null; - if (likely(!bp.v->level)) { - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, 0, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; - + bch2_trans_node_iter_init(trans, iter, + bp.v->btree_id, + bp.v->pos, + 0, + bp.v->level, + iter_flags); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); + return k; + } + + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); + + if (!bp.v->level) { int ret = backpointer_target_not_found(trans, bp, k, last_flushed); return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) + return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); From 12188c9e2b3464faa81ba571b027ffb276c7e497 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2024 21:00:34 -0500 Subject: [PATCH 030/180] bcachefs: bch2_btree_node_rewrite_pos() Add a new helper for rewriting a btree node given just the key, not a pointer to the node itself. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 20 ++++++++++++++++++++ fs/bcachefs/btree_update_interior.h | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 05aa9e32adf4..d3e0cf01ba37 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2238,6 +2238,26 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, return ret; } +int bch2_btree_node_rewrite_pos(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bpos pos, unsigned flags) +{ + BUG_ON(!level); + + /* Traverse one depth lower to get a pointer to the node itself: */ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, struct btree *b, unsigned flags) { diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 9261a9a341fb..be71cd73b864 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -169,10 +169,14 @@ static inline int
bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); +int bch2_btree_node_rewrite_pos(struct btree_trans *, + enum btree_id, unsigned, + struct bpos, unsigned); int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, struct btree *, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *, unsigned, bool); From 987fdbdb40293557e59bc27c4bc89ecd715b8019 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2024 10:40:11 -0500 Subject: [PATCH 031/180] bcachefs: bch2_move_data_phys() Add a more general version of bch2_evacuate_bucket - to be used for scrub. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 4 +- fs/bcachefs/move.c | 112 ++++++++++++++++++++++++++------------- fs/bcachefs/move_types.h | 15 +++++- fs/bcachefs/trace.h | 52 ------------------ 4 files changed, 90 insertions(+), 93 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index bab49d5ee598..bc1f91bf3e16 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -313,7 +313,7 @@ static int bch2_data_thread(void *arg) struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - ctx->stats.data_type = U8_MAX; + ctx->stats.done = true; return 0; } @@ -333,7 +333,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, - .p.data_type = ctx->stats.data_type, + .p.data_type = ctx->stats.done ? 
U8_MAX : ctx->stats.data_type, .p.btree_id = ctx->stats.pos.btree, .p.pos = ctx->stats.pos.pos, .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d825493cac25..cfcde0e33177 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -655,21 +655,21 @@ int bch2_move_data(struct bch_fs *c, bool wait_on_copygc, move_pred_fn pred, void *arg) { - struct moving_context ctxt; - int ret; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_move_data(&ctxt, start, end, pred, arg); + int ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts _data_opts) +static int __bch2_move_data_phys(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + unsigned dev, + u64 bucket_start, + u64 bucket_end, + move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -678,16 +678,20 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - struct data_update_opts data_opts; unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + struct bch_dev *ca = bch2_dev_tryget(c, dev); if (!ca) return 0; - trace_bucket_evacuate(c, &bucket); + bucket_end = min(bucket_end, ca->mi.nbuckets); + + struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); + struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); + bch2_dev_put(ca); + ca = NULL; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -698,8 +702,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &bp_iter, 
BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), 0); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) @@ -723,7 +726,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (ret) goto err; - if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) + if (!k.k || bkey_gt(k.k->p, bp_end)) break; if (k.k->type != KEY_TYPE_backpointer) @@ -731,6 +734,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + if (ctxt->stats) + ctxt->stats->offset = + bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + if (!bp.v->level) { k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ret = bkey_err(k); @@ -741,34 +748,22 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!k.k) goto next; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; } - data_opts = _data_opts; - data_opts.target = io_opts.background_target; - data_opts.rewrite_ptrs = 0; + struct data_update_opts data_opts = {}; + if (!pred(c, arg, k, &io_opts, &data_opts)) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ - unsigned i = 0; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { - if (p.ptr.dev == bucket.inode) { - if (p.ptr.cached) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - data_opts.rewrite_ptrs |= 1U << i; - break; - } - i++; - } ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); @@ -801,6 +796,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; + struct data_update_opts data_opts = {}; + if 
(!pred(c, arg, bkey_i_to_s_c(&b->key), &io_opts, &data_opts)) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); ret = bch2_btree_node_rewrite(trans, &iter, b, 0); @@ -817,21 +818,58 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, atomic64_add(sectors, &ctxt->stats->sectors_seen); atomic64_add(sectors, &ctxt->stats->sectors_moved); } - sectors_moved += btree_sectors(c); + sectors_moved += sectors; } next: bch2_btree_iter_advance(&bp_iter); } - - trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&last_flushed, c); return ret; } +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct evacuate_bucket_arg *arg = _arg; + + *data_opts = arg->data_opts; + + unsigned i = 0; + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == arg->bucket.inode && + (arg->gen < 0 || arg->gen == ptr->gen) && + !ptr->cached) + data_opts->rewrite_ptrs |= BIT(i); + i++; + } + + return data_opts->rewrite_ptrs != 0; +} + +int bch2_evacuate_bucket(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts data_opts) +{ + struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + + return __bch2_move_data_phys(ctxt, bucket_in_flight, + bucket.inode, + bucket.offset, + bucket.offset + 1, + evacuate_bucket_pred, &arg); +} + typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index e22841ef31e4..15d1f7f3d1dc 100644 --- a/fs/bcachefs/move_types.h 
+++ b/fs/bcachefs/move_types.h @@ -5,9 +5,20 @@ #include "bbpos_types.h" struct bch_move_stats { - enum bch_data_type data_type; - struct bbpos pos; char name[32]; + bool phys; + bool done; + + union { + struct { + enum bch_data_type data_type; + struct bbpos pos; + }; + struct { + unsigned dev; + u64 offset; + }; + }; atomic64_t keys_moved; atomic64_t keys_raced; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c1b51009edf6..2f25dcfc0e25 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -797,27 +797,6 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ -TRACE_EVENT(bucket_evacuate, - TP_PROTO(struct bch_fs *c, struct bpos *bucket), - TP_ARGS(c, bucket), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, dev_idx ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->dev_idx = bucket->inode; - __entry->bucket = bucket->offset; - ), - - TP_printk("%d:%d %u:%llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket) -); - DEFINE_EVENT(fs_str, move_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) @@ -881,37 +860,6 @@ TRACE_EVENT(move_data, __entry->sectors_raced) ); -TRACE_EVENT(evacuate_bucket, - TP_PROTO(struct bch_fs *c, struct bpos *bucket, - unsigned sectors, unsigned bucket_size, - int ret), - TP_ARGS(c, bucket, sectors, bucket_size, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, member ) - __field(u64, bucket ) - __field(u32, sectors ) - __field(u32, bucket_size ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->member = bucket->inode; - __entry->bucket = bucket->offset; - __entry->sectors = sectors; - __entry->bucket_size = bucket_size; - __entry->ret = ret; - ), - - TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->member, __entry->bucket, - __entry->sectors, __entry->bucket_size, - __entry->ret) -); - TRACE_EVENT(copygc, TP_PROTO(struct bch_fs 
*c, u64 buckets, From 2a2f7aaa8d3151bde9111e6be1254e1f160d1566 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2024 21:04:36 -0500 Subject: [PATCH 032/180] bcachefs: __bch2_move_data_phys() now uses bch2_btree_node_rewrite_pos() Kill most of the separate logic for btree nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 121 +++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 77 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index cfcde0e33177..ff396b33ef24 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -735,91 +735,58 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); if (ctxt->stats) - ctxt->stats->offset = - bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; if (!bp.v->level) { - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; } - - struct data_update_opts data_opts = {}; - if (!pred(c, arg, k, &io_opts, &data_opts)) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ - - ret = bch2_move_extent(ctxt, bucket_in_flight, - &iter, k, io_opts, data_opts); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation 
failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); - sectors_moved += sectors; - } else { - struct btree *b; - - b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); - ret = PTR_ERR_OR_ZERO(b); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - goto next; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!b) - goto next; - - struct data_update_opts data_opts = {}; - if (!pred(c, arg, bkey_i_to_s_c(&b->key), &io_opts, &data_opts)) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - - unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, sectors); - if (ctxt->stats) { - atomic64_add(sectors, &ctxt->stats->sectors_seen); - atomic64_add(sectors, &ctxt->stats->sectors_moved); - } - sectors_moved += sectors; } + + struct data_update_opts data_opts = {}; + if (!pred(c, arg, k, &io_opts, &data_opts)) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + /* move_extent will drop locks */ + unsigned sectors = !bp.v->level + ? bp.v->bucket_len + : btree_ptr_sectors_written(k); + + ret = !bp.v->level + ? 
bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts) + : bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; + } + if (ret) + goto err; + + if (ctxt->stats) + atomic64_add(sectors, &ctxt->stats->sectors_seen); + sectors_moved += sectors; next: bch2_btree_iter_advance(&bp_iter); } From ca24130ee412d991ef9925bf1b507f973daa9740 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2024 19:58:47 -0500 Subject: [PATCH 033/180] bcachefs: bch2_bkey_pick_read_device() can now specify a device To be used for scrub, where we want the read to come from a specific device. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 ++-- fs/bcachefs/debug.c | 2 +- fs/bcachefs/extents.c | 9 +++++++-- fs/bcachefs/extents.h | 2 +- fs/bcachefs/io_read.c | 8 ++++---- fs/bcachefs/io_read.h | 4 ++-- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 756736f9243d..fc1c01fd2d8d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1352,7 +1352,7 @@ static void btree_node_read_work(struct work_struct *work) can_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - &failed, &rb->pick) > 0; + &failed, &rb->pick, -1) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { @@ -1697,7 +1697,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick); + NULL, &pick, -1); if (ret <= 0) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 03a3b62d19a9..788af88f6979 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -191,7 +191,7 
@@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, unsigned offset = 0; int ret; - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { prt_printf(out, "error getting device to read from: invalid device\n"); return; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2d8042f853dc..ec653109de5b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -114,8 +114,9 @@ static inline bool ptr_better(struct bch_fs *c, * other devices, it will still pick a pointer from avoid. */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick, + int dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -137,6 +138,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, break; } + /* Are we being asked to read from a specific device? 
*/ + if (dev >= 0 && p.ptr.dev != dev) + continue; + /* * If there are any dirty pointers it's an error if we can't * read: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 204d765dd74c..ed160aaa9546 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -404,7 +404,7 @@ void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, int); /* KEY_TYPE_btree_ptr: */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 18c8e54f455e..673641331482 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -436,7 +436,7 @@ static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_ bkey_start_pos(&u->k.k->k), u->btree_id, bkey_i_to_s_c(u->k.k), - 0, failed, flags); + 0, failed, flags, -1); err: bch2_trans_iter_exit(trans, &iter); @@ -872,7 +872,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, unsigned flags, int dev) { struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; @@ -893,7 +893,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto out_read_done; } retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ if (!pick_ret) @@ -1250,7 +1250,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, - offset_into_extent, failed, flags); + offset_into_extent, failed, flags, -1); if (ret) goto err; diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h 
index f54c9943e34a..5142f2818b33 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -129,7 +129,7 @@ enum bch_read_flags { int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bpos, enum btree_id, struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); + struct bch_io_failures *, unsigned, int); static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bpos read_pos, @@ -137,7 +137,7 @@ static inline void bch2_read_extent(struct btree_trans *trans, unsigned offset_into_extent, unsigned flags) { __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); + data_btree, k, offset_into_extent, NULL, flags, -1); } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, From 3e2ad29865f279f7e9837e4eeb8518509e97333f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Dec 2024 16:24:23 -0500 Subject: [PATCH 034/180] bcachefs: bch2_btree_node_scrub() Add a function for scrubbing btree nodes - reading them in, and kicking off a rewrite if there's an error. The btree_node_read_done() checks have to be duplicated because we're not using a pointer to a struct btree - the btree node might already be in cache, and we need to check a specific replica, which might not be the one we previously read from. This will be used in the next patch implementing high-level scrub. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/btree_io.c | 185 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_io.h | 3 + fs/bcachefs/errcode.h | 1 + 4 files changed, 191 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 161cf2f05d2a..13acfbf3852a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -687,7 +687,8 @@ struct btree_trans_buf { x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ - x(btree_write_buffer) + x(btree_write_buffer) \ + x(btree_node_scrub) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index fc1c01fd2d8d..91c624db2958 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "bkey_sort.h" #include "btree_cache.h" @@ -1811,6 +1812,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } +struct btree_node_scrub { + struct bch_fs *c; + struct bch_dev *ca; + void *buf; + bool used_mempool; + unsigned written; + + enum btree_id btree; + unsigned level; + struct bkey_buf key; + __le64 seq; + + struct work_struct work; + struct bio bio; +}; + +static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, + struct printbuf *err) +{ + unsigned written = 0; + + if (le64_to_cpu(data->magic) != bset_magic(c)) { + prt_printf(err, "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(data->magic)); + return false; + } + + while (written < (ptr_written ?: btree_sectors(c))) { + struct btree_node_entry *bne; + struct bset *i; + bool first = !written; + + if (first) { + bne = NULL; + i = &data->keys; + } else { + bne = (void *) data + (written << 9); + i = &bne->keys; + + if (!ptr_written && i->seq != data->keys.seq) + 
break; + } + + struct nonce nonce = btree_nonce(i, written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); + if (bch2_crc_cmp(data->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); + return false; + } + } + + written += vstruct_sectors(data, c->block_bits); + } else { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + if (bch2_crc_cmp(bne->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); + return false; + } + } + + written += vstruct_sectors(bne, c->block_bits); + } + } + + return true; +} + +static void btree_node_scrub_work(struct work_struct *work) +{ + struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); + struct bch_fs *c = scrub->c; + struct printbuf err = PRINTBUF; + + __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, + bkey_i_to_s_c(scrub->key.k)); + prt_newline(&err); + + if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { + struct btree_trans *trans = bch2_trans_get(c); + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, scrub->btree, + scrub->key.k->k.p, 0, scrub->level - 1, 0); + + struct btree *b; + int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); + if (ret) + goto err; + + if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { + bch_err(c, "error validating btree node during scrub on %s at btree %s", + scrub->ca->name, err.buf); + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_begin(trans); + bch2_trans_put(trans); + } + + printbuf_exit(&err); + bch2_bkey_buf_exit(&scrub->key, c);; + btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); + percpu_ref_put(&scrub->ca->io_ref); + kfree(scrub); + 
bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); +} + +static void btree_node_scrub_endio(struct bio *bio) +{ + struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); + + queue_work(scrub->c->btree_read_complete_wq, &scrub->work); +} + +int bch2_btree_node_scrub(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c k, unsigned dev) +{ + if (k.k->type != KEY_TYPE_btree_ptr_v2) + return 0; + + struct bch_fs *c = trans->c; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) + return -BCH_ERR_erofs_no_writes; + + struct extent_ptr_decoded pick; + int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); + if (ret <= 0) + goto err; + + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { + ret = -BCH_ERR_device_offline; + goto err; + } + + bool used_mempool = false; + void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); + + unsigned vecs = buf_pages(buf, c->opts.btree_node_size); + + struct btree_node_scrub *scrub = + kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); + if (!scrub) { + ret = -ENOMEM; + goto err_free; + } + + scrub->c = c; + scrub->ca = ca; + scrub->buf = buf; + scrub->used_mempool = used_mempool; + scrub->written = btree_ptr_sectors_written(k); + + scrub->btree = btree; + scrub->level = level; + bch2_bkey_buf_init(&scrub->key); + bch2_bkey_buf_reassemble(&scrub->key, c, k); + scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; + + INIT_WORK(&scrub->work, btree_node_scrub_work); + + bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); + scrub->bio.bi_iter.bi_sector = pick.ptr.offset; + scrub->bio.bi_end_io = btree_node_scrub_endio; + submit_bio(&scrub->bio); + return 0; +err_free: + btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); + percpu_ref_put(&ca->io_ref); +err: + bch2_write_ref_put(c, 
BCH_WRITE_REF_btree_node_scrub); + return ret; +} + static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 6f9e4a6dacf7..75ead3815d67 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -132,6 +132,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, unsigned); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 1e8f65f95d60..20bfdee42309 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -275,6 +275,7 @@ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ + x(EIO, device_offline) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ From f269ae55d2de9c6aff5b289cd94c8eaab7b9b2c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Dec 2024 19:59:55 -0500 Subject: [PATCH 035/180] bcachefs: Scrub Add a new data op to walk all data and metadata in a filesystem, checking if it can be read successfully, and on error repairing from another copy if possible. - New helper: bch2_dev_idx_is_online(), so that we can bail out and report to userspace when we're unable to scrub because the device is offline - data_update_opts, which controls the data move path, now understands scrub: data is only read, not written. The read path is responsible for rewriting on read error, as with other reads. 
- scrub_pred skips data extents that don't have checksums - bch_ioctl_data has a new scrub member, which has a data_types field for data types to check - i.e. all data types, or only metadata. - Add new entries to bch_move_stats so that we can report numbers for corrected and uncorrected errors - Add a new enum to bch_ioctl_data_event for explicitly reporting completion and return code (i.e. device offline) Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 14 +++- fs/bcachefs/chardev.c | 33 ++++++-- fs/bcachefs/data_update.c | 72 ++++++++++------- fs/bcachefs/data_update.h | 8 +- fs/bcachefs/io_read.c | 4 +- fs/bcachefs/io_read.h | 1 + fs/bcachefs/move.c | 151 ++++++++++++++++++++++++++++++----- fs/bcachefs/move_types.h | 5 +- fs/bcachefs/sb-members.h | 12 +++ 9 files changed, 239 insertions(+), 61 deletions(-) diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index f1b746fac007..e8a89d375d2f 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -214,6 +214,10 @@ struct bch_ioctl_data { struct bpos end_pos; union { + struct { + __u32 dev; + __u32 data_types; + } scrub; struct { __u32 dev; __u32 pad; @@ -238,11 +242,19 @@ struct bch_ioctl_data_progress { __u64 sectors_done; __u64 sectors_total; + __u64 sectors_error_corrected; + __u64 sectors_error_uncorrected; } __packed __aligned(8); +enum bch_ioctl_data_event_ret { + BCH_IOCTL_DATA_EVENT_RET_done = 1, + BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, +}; + struct bch_ioctl_data_event { __u8 type; - __u8 pad[7]; + __u8 ret; + __u8 pad[6]; union { struct bch_ioctl_data_progress p; __u64 pad2[15]; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index bc1f91bf3e16..b38a3c6fe04c 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -313,7 +313,10 @@ static int bch2_data_thread(void *arg) struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - 
ctx->stats.done = true; + if (ctx->thr.ret == -BCH_ERR_device_offline) + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; + else + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; return 0; } @@ -332,14 +335,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .p.data_type = ctx->stats.done ? U8_MAX : ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_usage_read_short(c).used, + .type = BCH_DATA_EVENT_PROGRESS, + .ret = ctx->stats.ret, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), + .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), }; + if (ctx->arg.op == BCH_DATA_OP_scrub) { + struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); + if (ca) { + struct bch_dev_usage u; + bch2_dev_usage_read_fast(ca, &u); + for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) + if (ctx->arg.scrub.data_types & BIT(i)) + e.p.sectors_total += u.d[i].sectors; + bch2_dev_put(ca); + } + } else { + e.p.sectors_total = bch2_fs_usage_read_short(c).used; + } + if (len < sizeof(e)) return -EINVAL; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index ec63dd494c80..9b79cd18d16c 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -673,12 +673,46 @@ static bool can_allocate_without_blocking(struct bch_fs *c, return nr_replicas >= m->op.nr_replicas; } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts) +{ + struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); + + m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); + if (!m->bvecs) + return -ENOMEM; + + bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); + bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); + + if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { + kfree(m->bvecs); + m->bvecs = NULL; + return -ENOMEM; + } + + rbio_init(&m->rbio.bio, c, *io_opts, NULL); + m->rbio.bio.bi_iter.bi_size = buf_bytes; + m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); + m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + return 0; +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, - struct bch_io_opts io_opts, + struct bch_io_opts *io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) @@ -705,7 +739,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->ctxt = ctxt; m->stats = ctxt ? 
ctxt->stats : NULL; - bch2_write_op_init(&m->op, c, io_opts); + bch2_write_op_init(&m->op, c, *io_opts); m->op.pos = bkey_start_pos(k.k); m->op.version = k.k->bversion; m->op.target = data_opts.target; @@ -716,7 +750,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_data_encoded| BCH_WRITE_move| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression; + m->op.compression_opt = io_opts->background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; unsigned durability_have = 0, durability_removing = 0; @@ -754,7 +788,7 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } - unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* * If current extent durability is less than io_opts.data_replicas, @@ -787,7 +821,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); + ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); if (!ret) ret = -BCH_ERR_data_update_done_no_writes_needed; goto out_bkey_buf_exit; @@ -825,33 +859,11 @@ int bch2_data_update_init(struct btree_trans *trans, goto out_nocow_unlock; } - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); - - m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); - if (!m->bvecs) - goto enomem; - - bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); - bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); - - if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) - goto enomem; - - 
rbio_init(&m->rbio.bio, c, io_opts, NULL); - m->rbio.bio.bi_iter.bi_size = buf_bytes; - m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + ret = bch2_data_update_bios_init(m, c, io_opts); + if (ret) + goto out_nocow_unlock; return 0; -enomem: - ret = -ENOMEM; - kfree(m->bvecs); - m->bvecs = NULL; out_nocow_unlock: if (c->opts.nocow_enabled) bkey_nocow_unlock(c, k); diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index f4cf5d17cc37..c194cbbf5b51 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -16,6 +16,9 @@ struct data_update_opts { u8 extra_replicas; unsigned btree_insert_flags; unsigned write_flags; + + int read_dev; + bool scrub; }; void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, @@ -48,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *, struct bch_io_opts *, struct data_update_opts *); +int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, + struct bch_io_opts *); + void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, struct moving_context *, struct data_update *, struct write_point_specifier, - struct bch_io_opts, struct data_update_opts, + struct bch_io_opts *, struct data_update_opts, enum btree_id, struct bkey_s_c); void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 673641331482..cb30bdf52284 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -243,7 +243,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), - orig->opts, + &orig->opts, update_opts, btree_id, k); /* @@ -488,6 +488,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, blk_status_t error) { rbio->retry = retry; + 
rbio->saw_error = true; if (rbio->flags & BCH_READ_in_retry) return; @@ -969,6 +970,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, */ struct data_update *u = container_of(orig, struct data_update, rbio); if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { + BUG(); if (ca) percpu_ref_put(&ca->io_ref); goto hole; diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 5142f2818b33..73275da5d2c4 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -41,6 +41,7 @@ struct bch_read_bio { have_ioref:1, narrow_crcs:1, hole:1, + saw_error:1, retry:2, context:2; }; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ff396b33ef24..7614370f4590 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -89,7 +89,12 @@ static void move_free(struct moving_io *io) wake_up(&ctxt->wait); mutex_unlock(&ctxt->lock); - bch2_data_update_exit(&io->write); + if (!io->write.data_opts.scrub) { + bch2_data_update_exit(&io->write); + } else { + bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); + kfree(io->write.bvecs); + } kfree(io); } @@ -109,7 +114,20 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { - if (unlikely(io->write.rbio.bio.bi_status || io->write.rbio.hole)) { + struct moving_context *ctxt = io->write.ctxt; + + if (ctxt->stats) { + if (io->write.rbio.bio.bi_status) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_uncorrected); + else if (io->write.rbio.saw_error) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_corrected); + } + + if (unlikely(io->write.rbio.bio.bi_status || + io->write.rbio.hole || + io->write.data_opts.scrub)) { move_free(io); return; } @@ -263,7 +281,8 @@ int bch2_move_extent(struct moving_context *ctxt, bch2_data_update_opts_normalize(k, &data_opts); if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas) { + !data_opts.extra_replicas && + 
!data_opts.scrub) { if (data_opts.kill_ptrs) return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; @@ -284,16 +303,28 @@ int bch2_move_extent(struct moving_context *ctxt, io->read_sectors = k.k->size; io->write_sectors = k.k->size; - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free; + if (!data_opts.scrub) { + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + &io_opts, data_opts, iter->btree_id, k); + if (ret) + goto err_free; + + io->write.op.end_io = move_write_done; + } else { + bch2_bkey_buf_init(&io->write.k); + bch2_bkey_buf_reassemble(&io->write.k, c, k); + + io->write.op.c = c; + io->write.data_opts = data_opts; + + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); + if (ret) + goto err_free; + } io->write.rbio.bio.bi_end_io = move_read_endio; io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - io->write.op.end_io = move_write_done; - if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); @@ -324,11 +355,14 @@ int bch2_move_extent(struct moving_context *ctxt, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->write.rbio, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - BCH_READ_data_update| - BCH_READ_last_fragment); + __bch2_read_extent(trans, &io->write.rbio, + io->write.rbio.bio.bi_iter, + bkey_start_pos(k.k), + iter->btree_id, k, 0, + NULL, + BCH_READ_data_update| + BCH_READ_last_fragment, + data_opts.scrub ? 
data_opts.read_dev : -1); return 0; err_free: kfree(io); @@ -669,6 +703,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, unsigned dev, u64 bucket_start, u64 bucket_end, + unsigned data_types, move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; @@ -737,6 +772,9 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + if (!(data_types & BIT(bp.v->data_type))) + goto next; + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -760,17 +798,25 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, goto next; } + if (data_opts.scrub && + !bch2_dev_idx_is_online(c, data_opts.read_dev)) { + bch2_trans_iter_exit(trans, &iter); + ret = -BCH_ERR_device_offline; + break; + } + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); /* move_extent will drop locks */ - unsigned sectors = !bp.v->level - ? bp.v->bucket_len - : btree_ptr_sectors_written(k); + unsigned sectors = bp.v->bucket_len; - ret = !bp.v->level - ? 
bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts) - : bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + if (!bp.v->level) + ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); + else if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + else + ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); bch2_trans_iter_exit(trans, &iter); @@ -797,6 +843,30 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, return ret; } +static int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + struct moving_context ctxt; + + bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ctxt.stats->phys = true; + + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + struct evacuate_bucket_arg { struct bpos bucket; int gen; @@ -834,6 +904,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bucket.inode, bucket.offset, bucket.offset + 1, + ~0, evacuate_bucket_pred, &arg); } @@ -1075,6 +1146,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } +static bool scrub_pred(struct bch_fs *c, void *_arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bch_ioctl_data *arg = _arg; + + if (k.k->type != KEY_TYPE_btree_ptr_v2) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if 
(p.ptr.dev == arg->migrate.dev) { + if (!p.crc.csum_type) + return false; + break; + } + } + + data_opts->scrub = true; + data_opts->read_dev = arg->migrate.dev; + return true; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -1089,6 +1184,22 @@ int bch2_data_job(struct bch_fs *c, bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); switch (op.op) { + case BCH_DATA_OP_scrub: + /* + * prevent tests from spuriously failing, make sure we see all + * btree nodes that need to be repaired + */ + bch2_btree_interior_updates_flush(c); + + ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, + op.scrub.data_types, + NULL, + stats, + writepoint_hashed((unsigned long) current), + false, + scrub_pred, &op) ?: ret; + break; + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 15d1f7f3d1dc..82e473ed48d2 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -3,11 +3,12 @@ #define _BCACHEFS_MOVE_TYPES_H #include "bbpos_types.h" +#include "bcachefs_ioctl.h" struct bch_move_stats { char name[32]; bool phys; - bool done; + enum bch_ioctl_data_event_ret ret; union { struct { @@ -25,6 +26,8 @@ struct bch_move_stats { atomic64_t sectors_seen; atomic64_t sectors_moved; atomic64_t sectors_raced; + atomic64_t sectors_error_corrected; + atomic64_t sectors_error_uncorrected; }; struct move_bucket_key { diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 762083b564ee..b29b6c6c21dd 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -23,6 +23,18 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) return !percpu_ref_is_zero(&ca->io_ref); } +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); + +static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = 
bch2_dev_rcu(c, dev); + bool ret = ca && bch2_dev_is_online(ca); + rcu_read_unlock(); + + return ret; +} + static inline bool bch2_dev_is_readable(struct bch_dev *ca) { return bch2_dev_is_online(ca) && From 7d8321a286de288778e175ea5b967dceaeed9c96 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Mon, 27 Jan 2025 17:12:41 +0800 Subject: [PATCH 036/180] bcachefs: Fix subtraction underflow When ancestor is less than IS_ANCESTOR_BITMAP, we would get an incorrect result. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c54091a28909..ede0b480e7d4 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) goto out; } - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); + if (likely(ancestor >= IS_ANCESTOR_BITMAP)) + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); ret = id && id < ancestor ? test_ancestor_bitmap(t, id, ancestor) From 157ea5834133c02cb93e06e6a014cfc0b3b109e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Feb 2025 20:15:52 -0500 Subject: [PATCH 037/180] bcachefs: Read/move path counter work Reorganize counters a bit, grouping related counters together. 
New counters: - io_read_inline - io_read_hole Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 26 +++++++++++++------------- fs/bcachefs/io_read.c | 20 ++++++++++++++------ fs/bcachefs/move.c | 29 ++++++++++++++--------------- fs/bcachefs/sb-counters_format.h | 22 ++++++++++++---------- fs/bcachefs/trace.h | 24 ++++++++++++------------ 5 files changed, 65 insertions(+), 56 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 9b79cd18d16c..7e484afea551 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -93,7 +93,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static noinline void trace_move_extent_finish2(struct data_update *u, +static noinline void trace_io_move_finish2(struct data_update *u, struct bkey_i *new, struct bkey_i *insert) { @@ -113,11 +113,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); prt_newline(&buf); - trace_move_extent_finish(c, buf.buf); + trace_io_move_finish(c, buf.buf); printbuf_exit(&buf); } -static void trace_move_extent_fail2(struct data_update *m, +static void trace_io_move_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, struct bkey_i *insert, @@ -128,7 +128,7 @@ static void trace_move_extent_fail2(struct data_update *m, struct printbuf buf = PRINTBUF; unsigned rewrites_found = 0; - if (!trace_move_extent_fail_enabled()) + if (!trace_io_move_fail_enabled()) return; prt_str(&buf, msg); @@ -168,7 +168,7 @@ static void trace_move_extent_fail2(struct data_update *m, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); } - trace_move_extent_fail(c, buf.buf); + trace_io_move_fail(c, buf.buf); printbuf_exit(&buf); } @@ -216,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, new = bkey_i_to_extent(bch2_keylist_front(keys)); if (!bch2_extents_match(k, old)) { - trace_move_extent_fail2(m, 
k, bkey_i_to_s_c(&new->k_i), + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), NULL, "no match:"); goto nowork; } @@ -256,7 +256,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (m->data_opts.rewrite_ptrs && !rewrites_found && bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); goto nowork; } @@ -273,7 +273,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, } if (!bkey_val_u64s(&new->k)) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); goto nowork; } @@ -387,9 +387,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - if (trace_move_extent_finish_enabled()) - trace_move_extent_finish2(m, &new->k_i, insert); + this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); + if (trace_io_move_finish_enabled()) + trace_io_move_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -411,7 +411,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, &m->stats->sectors_raced); } - count_event(c, move_extent_fail); + count_event(c, io_move_fail); bch2_btree_iter_advance(&iter); goto next; @@ -439,7 +439,7 @@ void bch2_data_update_read_done(struct data_update *m) m->op.crc = m->rbio.pick.crc; m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - this_cpu_add(m->op.c->counters[BCH_COUNTER_move_extent_write], m->k.k->k.size); + this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); closure_call(&m->op.cl, bch2_write, NULL, NULL); } diff --git 
a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index cb30bdf52284..33642c5bb9c7 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -181,7 +181,7 @@ static noinline void promote_start(struct bch_read_bio *rbio) { struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - trace_and_count(op->write.op.c, read_promote, &rbio->bio); + trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); INIT_WORK(&op->work, promote_start_work); queue_work(rbio->c->write_ref_wq, &op->work); @@ -320,7 +320,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, *read_full = promote_full; return promote; nopromote: - trace_read_nopromote(c, ret); + trace_io_read_nopromote(c, ret); return NULL; } @@ -463,7 +463,9 @@ static void bch2_rbio_retry(struct work_struct *work) }; struct bch_io_failures failed = { .nr = 0 }; - trace_and_count(c, read_retry, &rbio->bio); + trace_io_read_retry(&rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], + bvec_iter_sectors(rbio->bvec_iter)); if (rbio->retry == READ_RETRY_AVOID) bch2_mark_io_failure(&failed, &rbio->pick); @@ -802,7 +804,7 @@ static void bch2_read_endio(struct bio *bio) if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { - trace_and_count(c, read_reuse_race, &rbio->bio); + trace_and_count(c, io_read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_retry_if_stale) bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); @@ -891,6 +893,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); + this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], + bvec_iter_sectors(iter)); goto out_read_done; } retry_pick: @@ -1069,10 +1073,12 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->bio.bi_end_io = bch2_read_endio; if (rbio->bounce) - trace_and_count(c, 
read_bounce, &rbio->bio); + trace_and_count(c, io_read_bounce, &rbio->bio); if (!(flags & BCH_READ_data_update)) this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + else + this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* @@ -1085,7 +1091,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); + trace_and_count(c, io_read_split, &orig->bio); } /* @@ -1173,6 +1179,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto out_read_done; hole: + this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], + bvec_iter_sectors(iter)); /* * won't normally happen in the BCH_READ_data_update * (bch2_move_extent()) path, but if we retry and the extent we wanted diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7614370f4590..1be1edfbc830 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, +static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - if (trace_move_extent_enabled()) { + if (trace_io_move_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_move_extent(c, buf.buf); + trace_io_move(c, buf.buf); printbuf_exit(&buf); } } -static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_move_extent_read_enabled()) { + if (trace_io_move_read_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_read(c, 
buf.buf); + trace_io_move_read(c, buf.buf); printbuf_exit(&buf); } } @@ -132,12 +132,12 @@ static void move_write(struct moving_io *io) return; } - if (trace_move_extent_write_enabled()) { + if (trace_io_move_write_enabled()) { struct bch_fs *c = io->write.op.c; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); - trace_move_extent_write(c, buf.buf); + trace_io_move_write(c, buf.buf); printbuf_exit(&buf); } @@ -273,7 +273,8 @@ int bch2_move_extent(struct moving_context *ctxt, struct bch_fs *c = trans->c; int ret = -ENOMEM; - trace_move_extent2(c, k, &io_opts, &data_opts); + trace_io_move2(c, k, &io_opts, &data_opts); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); @@ -338,9 +339,7 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); - trace_move_extent_read2(c, k); + trace_io_move_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -374,15 +373,15 @@ int bch2_move_extent(struct moving_context *ctxt, bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - count_event(c, move_extent_start_fail); + count_event(c, io_move_start_fail); - if (trace_move_extent_start_fail_enabled()) { + if (trace_io_move_start_fail_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, ": "); prt_str(&buf, bch2_err_str(ret)); - trace_move_extent_start_fail(c, buf.buf); + trace_io_move_start_fail(c, buf.buf); printbuf_exit(&buf); } return ret; diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index d0391c5d4c48..c82a891026d3 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -9,8 +9,20 @@ enum counters_flags { #define BCH_PERSISTENT_COUNTERS() \ 
x(io_read, 0, TYPE_SECTORS) \ + x(io_read_inline, 80, TYPE_SECTORS) \ + x(io_read_hole, 81, TYPE_SECTORS) \ + x(io_read_promote, 30, TYPE_COUNTER) \ + x(io_read_bounce, 31, TYPE_COUNTER) \ + x(io_read_split, 33, TYPE_COUNTER) \ + x(io_read_reuse_race, 34, TYPE_COUNTER) \ + x(io_read_retry, 32, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ + x(io_move_read, 35, TYPE_SECTORS) \ + x(io_move_write, 36, TYPE_SECTORS) \ + x(io_move_finish, 37, TYPE_SECTORS) \ + x(io_move_fail, 38, TYPE_COUNTER) \ + x(io_move_start_fail, 39, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ x(bucket_discard_fast, 79, TYPE_COUNTER) \ @@ -39,16 +51,6 @@ enum counters_flags { x(journal_reclaim_finish, 27, TYPE_COUNTER) \ x(journal_reclaim_start, 28, TYPE_COUNTER) \ x(journal_write, 29, TYPE_COUNTER) \ - x(read_promote, 30, TYPE_COUNTER) \ - x(read_bounce, 31, TYPE_COUNTER) \ - x(read_split, 33, TYPE_COUNTER) \ - x(read_retry, 32, TYPE_COUNTER) \ - x(read_reuse_race, 34, TYPE_COUNTER) \ - x(move_extent_read, 35, TYPE_SECTORS) \ - x(move_extent_write, 36, TYPE_SECTORS) \ - x(move_extent_finish, 37, TYPE_SECTORS) \ - x(move_extent_fail, 38, TYPE_COUNTER) \ - x(move_extent_start_fail, 39, TYPE_COUNTER) \ x(copygc, 40, TYPE_COUNTER) \ x(copygc_wait, 41, TYPE_COUNTER) \ x(gc_gens_end, 42, TYPE_COUNTER) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 2f25dcfc0e25..5718988dd7d6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -295,12 +295,12 @@ TRACE_EVENT(write_super, /* io.c: */ -DEFINE_EVENT(bio, read_promote, +DEFINE_EVENT(bio, io_read_promote, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -TRACE_EVENT(read_nopromote, +TRACE_EVENT(io_read_nopromote, TP_PROTO(struct bch_fs *c, int ret), TP_ARGS(c, ret), @@ -319,22 +319,22 @@ TRACE_EVENT(read_nopromote, __entry->ret) ); -DEFINE_EVENT(bio, read_bounce, +DEFINE_EVENT(bio, io_read_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, 
read_split, +DEFINE_EVENT(bio, io_read_split, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_retry, +DEFINE_EVENT(bio, io_read_retry, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_reuse_race, +DEFINE_EVENT(bio, io_read_reuse_race, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); @@ -797,32 +797,32 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ -DEFINE_EVENT(fs_str, move_extent, +DEFINE_EVENT(fs_str, io_move, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_read, +DEFINE_EVENT(fs_str, io_move_read, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_write, +DEFINE_EVENT(fs_str, io_move_write, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_finish, +DEFINE_EVENT(fs_str, io_move_finish, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_fail, +DEFINE_EVENT(fs_str, io_move_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_start_fail, +DEFINE_EVENT(fs_str, io_move_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); From e63cf203d781902a8d3af3333ccf6382f8c3d416 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jan 2025 11:45:22 -0500 Subject: [PATCH 038/180] bcachefs: Convert migrate to move_data_phys() Iterating over backpointers on a specific device is potentially much cheaper than walking all filesystem data. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 5 +++++ fs/bcachefs/chardev.c | 4 +++- fs/bcachefs/move.c | 25 +++++++++---------------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index e8a89d375d2f..52594e925eb7 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -234,6 +234,11 @@ enum bch_data_event { BCH_DATA_EVENT_NR = 1, }; +enum data_progress_data_type_special { + DATA_PROGRESS_DATA_TYPE_phys = 254, + DATA_PROGRESS_DATA_TYPE_done = 255, +}; + struct bch_ioctl_data_progress { __u8 data_type; __u8 btree_id; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index b38a3c6fe04c..57d55b3ddc71 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -315,8 +315,10 @@ static int bch2_data_thread(void *arg) ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); if (ctx->thr.ret == -BCH_ERR_device_offline) ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; - else + else { ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; + ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; + } return 0; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1be1edfbc830..12519181026f 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -859,6 +859,7 @@ static int bch2_move_data_phys(struct bch_fs *c, bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); ctxt.stats->phys = true; + ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); bch2_moving_ctxt_exit(&ctxt); @@ -1048,14 +1049,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } -static bool migrate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return migrate_pred(c, arg, 
bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - /* * Ancient versions of bcachefs produced packed formats which could represent * keys that the in memory format cannot represent; this checks for those @@ -1218,14 +1211,14 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_btree(c, start, end, - migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - migrate_pred, &op) ?: ret; + ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, + ~0, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; + bch2_btree_interior_updates_flush(c); ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_rewrite_old_nodes: From 45f0e6c838e5d9af3f013adb4ba9aad3bcbcbe3b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 01:33:01 -0500 Subject: [PATCH 039/180] bcachefs: bch2_indirect_extent_missing_error() prints path, not just inode number We want all error messages converted to print paths, not just inode numbers - users want this information, and it speeds up debugging too. Auditing and converting all error messages is going to be a big project, so for the moment we're just doing this incrementally. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 32 +++++++++++++++++++++++++++++++- fs/bcachefs/error.h | 3 +++ fs/bcachefs/reflink.c | 21 +++++++++++++++------ fs/bcachefs/snapshot.c | 2 +- fs/bcachefs/snapshot.h | 1 + 5 files changed, 51 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 038da6a61f6b..14cfcfa39590 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -535,7 +535,6 @@ int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, sub u32 restart_count = trans->restart_count; int ret = 0; - /* XXX: we don't yet attempt to print paths when we don't know the subvol */ if (inum.subvol) ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); if (!inum.subvol || ret) @@ -562,3 +561,34 @@ void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, { bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); } + +int bch2_inum_snap_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) +{ + u32 restart_count = trans->restart_count; + struct bch_fs *c = trans->c; + int ret = 0; + + if (!bch2_snapshot_is_leaf(c, pos.snapshot)) + prt_str(out, "(multiple snapshots) "); + + subvol_inum inum = { + .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), + .inum = pos.inode, + }; + + if (inum.subvol) + ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); + + return trans_was_restarted(trans, restart_count); +} + +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) +{ + int ret = bch2_inum_snap_err_msg_trans(trans, out, pos); + prt_printf(out, " offset %llu: ", pos.offset << 8); + return ret; +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7acf2a27ca28..95cf48a31dbf 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -244,4 +244,7 @@ int 
bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subv void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); +int bch2_inum_snap_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 441e648f28b5..50118661e64b 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, BUG_ON(missing_start < refd_start); BUG_ON(missing_end > refd_end); - if (fsck_err(trans, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - missing_start, missing_end)) { + struct bpos missing_pos = bkey_start_pos(p.k); + missing_pos.offset += missing_start - live_start; + + prt_printf(&buf, "pointer to missing indirect extent in "); + ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); + if (ret) + goto err; + + prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9); + bch2_bkey_val_to_text(&buf, c, p.s_c); + + prt_printf(&buf, "\n missing reflink btree range %llu-%llu", + missing_start, missing_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); ret = PTR_ERR_OR_ZERO(new); if (ret) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ede0b480e7d4..e7f197896db1 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -390,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) return 0; } -static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, 
u32 snapshot_root) +u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) { u32 id = snapshot_root; u32 subvol = 0, s; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 00373cf32e7b..81180181d7c9 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) return id; } +u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) From 06284963e3d86c5e0cf56982c2d947eeb0f30871 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 13:37:30 -0500 Subject: [PATCH 040/180] bcachefs: bch2_inum_offset_err_msg_trans() no longer handles transaction restarts we're starting to use error messages with paths in fsck_errors(), where we do not want nested transaction restart handling, so let's prepare for that. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 48 ++++++++++++++---------------------- fs/bcachefs/error.h | 3 --- fs/bcachefs/fs-io-buffered.c | 3 ++- fs/bcachefs/io_misc.c | 3 ++- fs/bcachefs/io_read.c | 11 ++++++--- 5 files changed, 29 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 14cfcfa39590..c8fc58fab958 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -530,42 +530,33 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } -int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) { u32 restart_count = trans->restart_count; int ret = 0; - if (inum.subvol) - ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return 
ret; + } if (!inum.subvol || ret) prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + prt_printf(out, " offset %llu: ", offset); return trans_was_restarted(trans, restart_count); } -int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - subvol_inum inum, u64 offset) -{ - int ret = bch2_inum_err_msg_trans(trans, out, inum); - prt_printf(out, " offset %llu: ", offset); - return ret; -} - -void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) -{ - bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); -} - void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum, u64 offset) { - bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); } -int bch2_inum_snap_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bpos pos) +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) { - u32 restart_count = trans->restart_count; struct bch_fs *c = trans->c; int ret = 0; @@ -577,18 +568,15 @@ int bch2_inum_snap_err_msg_trans(struct btree_trans *trans, struct printbuf *out .inum = pos.inode, }; - if (inum.subvol) - ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } + if (!inum.subvol || ret) prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); - return trans_was_restarted(trans, restart_count); -} - -int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bpos pos) -{ - int ret = bch2_inum_snap_err_msg_trans(trans, out, pos); prt_printf(out, " offset %llu: ", pos.offset << 8); - return ret; + return 0; } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 95cf48a31dbf..76da0e88cee8 100644 --- 
a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -238,13 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); _ret; \ }) -int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); -void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); -int bch2_inum_snap_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 0ec2eebdeffa..5ab1c73c8d4c 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -240,7 +240,8 @@ static void bchfs_read(struct btree_trans *trans, if (ret) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); prt_printf(&buf, "read error %i from btree lookup", ret); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 5353979117b0..6b842c8d21be 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -115,7 +115,8 @@ int bch2_extent_fallocate(struct btree_trans *trans, bch2_increment_clock(c, sectors_allocated, WRITE); if (should_print_err(ret)) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 
33642c5bb9c7..dcd5a2aee0f1 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -329,9 +329,10 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - return bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { rbio->subvol, read_pos.inode }, - read_pos.offset << 9); + return lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { rbio->subvol, read_pos.inode }, + read_pos.offset << 9)); } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, @@ -1281,7 +1282,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, + bvec_iter.bi_sector << 9)); prt_printf(&buf, "read error %i from btree lookup", ret); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); From baabeb499758706a9093c610c5d97cf6de5f649a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Feb 2025 15:59:28 -0500 Subject: [PATCH 041/180] bcachefs: Factor out progress.[ch] the backpointers code has progress indicators; these aren't great, since they print to the dmesg console and we much prefer to have progress indicators reporting to a specific userspace program so they're not spamming the system console. But not all codepaths that need progress indicators support that yet, and we don't want users to think "this is hung". 
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/backpointers.c | 74 +++----------------------------------- fs/bcachefs/backpointers.h | 4 +-- fs/bcachefs/progress.c | 63 ++++++++++++++++++++++++++++++++ fs/bcachefs/progress.h | 29 +++++++++++++++ 5 files changed, 100 insertions(+), 71 deletions(-) create mode 100644 fs/bcachefs/progress.c create mode 100644 fs/bcachefs/progress.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index d2689388d5e8..1cf17a16af9f 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -67,6 +67,7 @@ bcachefs-y := \ nocow_locking.o \ opts.o \ printbuf.o \ + progress.o \ quota.o \ rebalance.o \ rcu_pending.o \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 3aff2b24de4a..bb799b86aa69 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -11,6 +11,7 @@ #include "checksum.h" #include "disk_accounting.h" #include "error.h" +#include "progress.h" #include @@ -735,71 +736,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } -struct progress_indicator_state { - unsigned long next_print; - u64 nodes_seen; - u64 nodes_total; - struct btree *last_node; -}; - -static inline void progress_init(struct progress_indicator_state *s, - struct bch_fs *c, - u64 btree_id_mask) -{ - memset(s, 0, sizeof(*s)); - - s->next_print = jiffies + HZ * 10; - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - if (!(btree_id_mask & BIT_ULL(i))) - continue; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = i, - }; - - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - s->nodes_total += div64_ul(v, btree_sectors(c)); - } -} - -static inline bool progress_update_p(struct progress_indicator_state *s) -{ - bool ret = time_after_eq(jiffies, s->next_print); - - if (ret) - s->next_print = jiffies + HZ * 10; - return ret; -} - -static void progress_update_iter(struct btree_trans *trans, 
- struct progress_indicator_state *s, - struct btree_iter *iter, - const char *msg) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(btree_iter_path(trans, iter))->b; - - s->nodes_seen += b != s->last_node; - s->last_node = b; - - if (progress_update_p(s)) { - struct printbuf buf = PRINTBUF; - unsigned percent = s->nodes_total - ? div64_u64(s->nodes_seen * 100, s->nodes_total) - : 0; - - prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); - bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -} - static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { @@ -807,7 +743,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct progress_indicator_state progress; int ret = 0; - progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); + bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); @@ -826,7 +762,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); @@ -1226,11 +1162,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); - progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); + bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, 
POS_MIN, BTREE_ITER_prefetch, k, ({ - progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); check_one_backpointer(trans, start, end, k, &last_flushed); })); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 060dad1521ee..5c6a17c21769 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H -#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#ifndef _BCACHEFS_BACKPOINTERS_H +#define _BCACHEFS_BACKPOINTERS_H #include "btree_cache.h" #include "btree_iter.h" diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c new file mode 100644 index 000000000000..bafd1c91a802 --- /dev/null +++ b/fs/bcachefs/progress.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" +#include "disk_accounting.h" +#include "progress.h" + +void bch2_progress_init(struct progress_indicator_state *s, + struct bch_fs *c, + u64 btree_id_mask) +{ + memset(s, 0, sizeof(*s)); + + s->next_print = jiffies + HZ * 10; + + for (unsigned i = 0; i < BTREE_ID_NR; i++) { + if (!(btree_id_mask & BIT_ULL(i))) + continue; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_btree, + .btree.id = i, + }; + + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + s->nodes_total += div64_ul(v, btree_sectors(c)); + } +} + +static inline bool progress_update_p(struct progress_indicator_state *s) +{ + bool ret = time_after_eq(jiffies, s->next_print); + + if (ret) + s->next_print = jiffies + HZ * 10; + return ret; +} + +void bch2_progress_update_iter(struct btree_trans *trans, + struct progress_indicator_state *s, + struct btree_iter *iter, + const char *msg) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(btree_iter_path(trans, iter))->b; + + s->nodes_seen += b != s->last_node; + 
s->last_node = b; + + if (progress_update_p(s)) { + struct printbuf buf = PRINTBUF; + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", + msg, percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } +} diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h new file mode 100644 index 000000000000..23fb1811f943 --- /dev/null +++ b/fs/bcachefs/progress.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_PROGRESS_H +#define _BCACHEFS_PROGRESS_H + +/* + * Lame progress indicators + * + * We don't like to use these because they print to the dmesg console, which is + * spammy - we much prefer to be wired up to a userspace program (e.g. via + * thread_with_file) and have it print the progress indicator. + * + * But some code is old and doesn't support that, or runs in a context where + * that's not yet practical (mount). + */ + +struct progress_indicator_state { + unsigned long next_print; + u64 nodes_seen; + u64 nodes_total; + struct btree *last_node; +}; + +void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); +void bch2_progress_update_iter(struct btree_trans *, + struct progress_indicator_state *, + struct btree_iter *, + const char *); + +#endif /* _BCACHEFS_PROGRESS_H */ From 491eda63947335e4f779443b524ae6086b0de052 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Feb 2025 16:25:29 -0500 Subject: [PATCH 042/180] bcachefs: Add a progress indicator to bch2_dev_data_drop() This code needs quite a bit of work: we don't want to be walking all metadata in the filesystem, we should just be walking backpointers, and it should be switched to a data ioctl that can report progress via a file descriptor, not the system console.
But that'll take more work - before we can safely walk only backpointers we need to change device add to not reuse device indexes, since with that change accounting being wrong introduces the possibility of removing a device that still has pointers. Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ddc187fb693d..57ad662871ba 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -15,6 +15,7 @@ #include "keylist.h" #include "migrate.h" #include "move.h" +#include "progress.h" #include "replicas.h" #include "super-io.h" @@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_usrdata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, int flags) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id id; @@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); + })); if (ret) break; } @@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } -static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_metadata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, int flags) { struct btree_trans *trans; struct btree_iter iter; @@ -125,6 +132,8 @@ static int 
bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { + bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; @@ -169,6 +178,11 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, + BIT_ULL(BTREE_ID_extents)| + BIT_ULL(BTREE_ID_reflink)); + + return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: + bch2_dev_metadata_drop(c, &progress, dev_idx, flags); } From 3eccc02035f8d7f3674cba779f4fc6193d03de92 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 14:01:05 -0500 Subject: [PATCH 043/180] bcachefs: add progress indicator to check_allocations Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dd1d9b74076e..ff681e733598 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -27,6 +27,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "progress.h" #include "recovery_passes.h" #include "reflink.h" #include "recovery.h" @@ -656,7 +657,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, return ret; } -static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) +static int bch2_gc_btree(struct btree_trans *trans, + struct progress_indicator_state *progress, + enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 
0 : 1; @@ -673,6 +676,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); })); @@ -717,22 +721,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - enum btree_id ids[BTREE_ID_NR]; struct printbuf buf = PRINTBUF; - unsigned i; int ret = 0; - for (i = 0; i < BTREE_ID_NR; i++) + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, ~0ULL); + + enum btree_id ids[BTREE_ID_NR]; + for (unsigned i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { unsigned btree = i < BTREE_ID_NR ? 
ids[i] : i; if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = bch2_gc_btree(trans, btree, true); + ret = bch2_gc_btree(trans, &progress, btree, true); } printbuf_exit(&buf); From c2be81d48a52f51d4260b37cb8fdc4a2762db59a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jan 2025 13:43:15 -0500 Subject: [PATCH 044/180] bcachefs: Kill journal_res_state.unwritten_idx Dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 +-- fs/bcachefs/journal_io.c | 13 +------------ fs/bcachefs/journal_types.h | 7 +++---- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 05b1250619ec..65d8cc5ff7d3 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1389,8 +1389,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - j->reservations.unwritten_idx++; + j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 11c39e0c34f4..f2ff28e6697c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1611,7 +1611,6 @@ static CLOSURE_CALLBACK(journal_write_done) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - union journal_res_state old, new; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1671,16 +1670,6 @@ static CLOSURE_CALLBACK(journal_write_done) if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); - BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - - new.unwritten_idx++; - } while 
(!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - closure_wake_up(&w->wait); completed = true; } @@ -1695,7 +1684,7 @@ static CLOSURE_CALLBACK(journal_write_done) } if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 1ef3a28ed6ab..c407f9ce4d24 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -98,9 +98,8 @@ union journal_res_state { }; struct { - u64 cur_entry_offset:20, + u64 cur_entry_offset:22, idx:2, - unwritten_idx:2, buf0_count:10, buf1_count:10, buf2_count:10, @@ -110,13 +109,13 @@ union journal_res_state { /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ /* * We stash some journal state as sentinal values in cur_entry_offset: * note - cur_entry_offset is in units of u64s */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) From 199a3578edec89069f1e07cf92135c8fce50e706 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jan 2025 14:02:44 -0500 Subject: [PATCH 045/180] bcachefs: Kill journal_res.idx More dead code. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal.h | 11 ++++++----- fs/bcachefs/journal_types.h | 1 - 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 65d8cc5ff7d3..26886513e2d2 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -306,7 +306,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t bch2_journal_space_available(j); - __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); + __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt(struct journal *j) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 107f7f901cd9..1e5fcfe3624a 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -193,7 +193,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) static inline struct jset_entry * journal_res_entry(struct journal *j, struct journal_res *res) { - return vstruct_idx(j->buf[res->idx].data, res->offset); + return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); } static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, @@ -267,8 +267,9 @@ bool bch2_journal_entry_close(struct journal *); void bch2_journal_do_writes(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64); -static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -276,8 +277,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s bch2_journal_buf_put_final(j, seq); } -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_BUF_MASK; union 
journal_res_state s; s = journal_state_buf_put(j, idx); @@ -306,7 +308,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx, res->seq); + bch2_journal_buf_put(j, res->seq); res->ref = 0; } @@ -361,7 +363,6 @@ static inline int journal_res_get_fast(struct journal *j, &old.v, new.v)); res->ref = true; - res->idx = old.idx; res->offset = old.cur_entry_offset; res->seq = le64_to_cpu(j->buf[old.idx].data->seq); return 1; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index c407f9ce4d24..43cd2a7e0f7f 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -82,7 +82,6 @@ struct journal_entry_pin { struct journal_res { bool ref; - u8 idx; u16 u64s; u32 offset; u64 seq; From 2e853fdbc74411643b00a14ee75ca234675a9bf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jan 2025 13:06:35 -0500 Subject: [PATCH 046/180] bcachefs: Don't touch journal_buf->data->seq in journal_res_get This is a small optimization, reducing the number of cachelines we touch in the fast path - and it's also necessary for the next patch that increases JOURNAL_BUF_NR. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1e5fcfe3624a..e514d664b8ae 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -364,7 +364,10 @@ static inline int journal_res_get_fast(struct journal *j, res->ref = true; res->offset = old.cur_entry_offset; - res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + res->seq = journal_cur_seq(j); + res->seq -= (res->seq - old.idx) & JOURNAL_BUF_MASK; + + EBUG_ON(res->seq != le64_to_cpu(j->buf[old.idx].data->seq)); return 1; } From 35282ce9e82f6e4c044e6a74b6fef45dd4996718 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Feb 2025 19:13:39 -0500 Subject: [PATCH 047/180] bcachefs: Free journal bufs when not in use Since we're increasing the number of 'struct journal_bufs', we don't want them all permanently holding onto buffers for the journal data - that'd be 16 * 2MB = 32MB, or potentially more. Add a single-element mempool (open coded, since buffer size varies), this also means we won't be hitting the memory allocator every time we open and close a journal entry/buffer. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 85 +++++++++++++++++++++++++++++-------- fs/bcachefs/journal_io.c | 17 +++++++- fs/bcachefs/journal_types.h | 3 ++ 3 files changed, 87 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 26886513e2d2..d47a4dfa03e3 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -58,9 +58,11 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); - prt_printf(out, "size:\t"); - prt_human_readable_u64(out, vstruct_bytes(buf->data)); - prt_newline(out); + if (buf->data) { + prt_printf(out, "size:\t"); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + } prt_printf(out, "expires:\t"); prt_printf(out, "%li jiffies\n", buf->expires - jiffies); @@ -87,6 +89,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) { + lockdep_assert_held(&j->lock); + out->atomic++; + if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -95,6 +100,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq++) bch2_journal_buf_to_text(out, j, seq); prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? 
"open" : "closed"); + + --out->atomic; } static inline struct journal_buf * @@ -104,10 +111,8 @@ journal_seq_to_buf(struct journal *j, u64 seq) EBUG_ON(seq > journal_cur_seq(j)); - if (journal_seq_unwritten(j, seq)) { + if (journal_seq_unwritten(j, seq)) buf = j->buf + (seq & JOURNAL_BUF_MASK); - EBUG_ON(le64_to_cpu(buf->data->seq) != seq); - } return buf; } @@ -398,8 +403,16 @@ static int journal_entry_open(struct journal *j) return JOURNAL_ERR_insufficient_devices; /* -EROFS */ } + if (!j->free_buf && !buf->data) + return JOURNAL_ERR_enomem; /* will retry after write completion frees up a buf */ + BUG_ON(!j->cur_entry_sectors); + if (!buf->data) { + swap(buf->data, j->free_buf); + swap(buf->buf_size, j->free_buf_size); + } + buf->expires = (journal_cur_seq(j) == j->flushed_seq_ondisk ? jiffies @@ -514,6 +527,33 @@ static void journal_write_work(struct work_struct *work) spin_unlock(&j->lock); } +static void journal_buf_prealloc(struct journal *j) +{ + if (j->free_buf && + j->free_buf_size >= j->buf_size_want) + return; + + unsigned buf_size = j->buf_size_want; + + spin_unlock(&j->lock); + void *buf = kvmalloc(buf_size, GFP_NOFS); + spin_lock(&j->lock); + + if (buf && + (!j->free_buf || + buf_size > j->free_buf_size)) { + swap(buf, j->free_buf); + swap(buf_size, j->free_buf_size); + } + + if (unlikely(buf)) { + spin_unlock(&j->lock); + /* kvfree can sleep */ + kvfree(buf); + spin_lock(&j->lock); + } +} + static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { @@ -544,6 +584,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, spin_lock(&j->lock); + journal_buf_prealloc(j); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() @@ -571,20 +613,26 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, can_discard = j->can_discard; spin_unlock(&j->lock); out: + if (likely(!ret)) + return 0; if 
(ret == JOURNAL_ERR_retry) goto retry; - if (!ret) - return 0; if (journal_error_check_stuck(j, ret, flags)) ret = -BCH_ERR_journal_res_get_blocked; if (ret == JOURNAL_ERR_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { - + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && + trace_journal_entry_full_enabled()) { struct printbuf buf = PRINTBUF; + + bch2_printbuf_make_room(&buf, 4096); + + spin_lock(&j->lock); prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + trace_journal_entry_full(c, buf.buf); printbuf_exit(&buf); count_event(c, journal_entry_full); @@ -951,7 +999,8 @@ static void __bch2_journal_block(struct journal *j) new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); + if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -1481,6 +1530,7 @@ void bch2_fs_journal_exit(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) kvfree(j->buf[i].data); + kvfree(j->free_buf); free_fifo(&j->pin); } @@ -1507,13 +1557,13 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { - j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); - if (!j->buf[i].data) - return -BCH_ERR_ENOMEM_journal_buf; + j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; + j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); + if (!j->free_buf) + return -BCH_ERR_ENOMEM_journal_buf; + + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - } j->pin.front = j->pin.back = 1; @@ -1563,6 +1613,7 @@ void 
__bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "average write size:\t"); prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); prt_newline(out); + prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f2ff28e6697c..61f71e7baff2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1640,6 +1640,21 @@ static CLOSURE_CALLBACK(journal_write_done) j->err_seq = seq; w->write_done = true; + if (!j->free_buf || j->free_buf_size < w->buf_size) { + swap(j->free_buf, w->data); + swap(j->free_buf_size, w->buf_size); + } + + if (w->data) { + void *buf = w->data; + w->data = NULL; + w->buf_size = 0; + + spin_unlock(&j->lock); + kvfree(buf); + spin_lock(&j->lock); + } + bool completed = false; for (seq = journal_last_unwritten_seq(j); @@ -1649,7 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->write_done) break; - if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { + if (!j->err_seq && !w->noflush) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 43cd2a7e0f7f..ee9cb17c3ccf 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -156,6 +156,7 @@ enum journal_flags { x(journal_full) \ x(journal_pin_full) \ x(journal_stuck) \ + x(enomem) \ x(insufficient_devices) enum journal_errors { @@ -218,6 +219,8 @@ struct journal { * other is possibly being written out. 
*/ struct journal_buf buf[JOURNAL_BUF_NR]; + void *free_buf; + unsigned free_buf_size; spinlock_t lock; From 898bda5b72a72a2617c877842c04b2a51c765a28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Jan 2025 13:46:47 -0500 Subject: [PATCH 048/180] bcachefs: Increase JOURNAL_BUF_NR Increase journal pipelining. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/journal.c | 35 +++++++++++++++++++++++++++++++---- fs/bcachefs/journal.h | 32 ++++++++++++++++++++++---------- fs/bcachefs/journal_types.h | 8 +++++++- 4 files changed, 61 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 13acfbf3852a..9791bfe08895 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -456,6 +456,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_journal_max_open) \ x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d47a4dfa03e3..40d3ad5a1e5c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -56,7 +56,12 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); + if (!buf->write_started) + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); + + struct closure *cl = &buf->io; + int r = atomic_read(&cl->remaining); + prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); if (buf->data) { prt_printf(out, "size:\t"); @@ -200,7 +205,8 @@ void bch2_journal_do_writes(struct journal *j) if (w->write_started) continue; - if (!journal_state_count(j->reservations, idx)) { + if (!journal_state_seq_count(j, j->reservations, seq)) { + j->seq_write_started = seq; w->write_started = true; 
closure_call(&w->io, bch2_journal_write, j->wq, NULL); } @@ -396,6 +402,9 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; + if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) + return JOURNAL_ERR_max_open; + if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) @@ -477,7 +486,7 @@ static int journal_entry_open(struct journal *j) new.idx++; BUG_ON(journal_state_count(new, new.idx)); - BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); journal_state_inc(&new); @@ -638,6 +647,23 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, count_event(c, journal_entry_full); } + if (ret == JOURNAL_ERR_max_open && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && + trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_printbuf_make_room(&buf, 4096); + + spin_lock(&j->lock); + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -1041,7 +1067,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou *blocked = true; } - ret = journal_state_count(s, idx) > open + ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open ? 
ERR_PTR(-EAGAIN) : buf; break; @@ -1398,6 +1424,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; + j->seq_write_started = cur_seq - 1; j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index e514d664b8ae..1c460ded2a11 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j) closure_wake_up(&j->async_wait); } -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) @@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) return j->seq_ondisk + 1; } +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + unsigned idx = (journal_cur_seq(j) & + JOURNAL_BUF_MASK & + ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; + + return j->buf + idx; +} + static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { @@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx) BUG(); } +static inline int journal_state_seq_count(struct journal *j, + union journal_res_state s, u64 seq) +{ + if (journal_cur_seq(j) - seq <= JOURNAL_STATE_BUF_NR) + return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); + else + return 0; +} + static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; @@ -269,7 +282,7 @@ void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) { - unsigned idx = seq & JOURNAL_BUF_MASK; + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -279,7 +292,7 @@ 
static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) static inline void bch2_journal_buf_put(struct journal *j, u64 seq) { - unsigned idx = seq & JOURNAL_BUF_MASK; + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -365,9 +378,7 @@ static inline int journal_res_get_fast(struct journal *j, res->ref = true; res->offset = old.cur_entry_offset; res->seq = journal_cur_seq(j); - res->seq -= (res->seq - old.idx) & JOURNAL_BUF_MASK; - - EBUG_ON(res->seq != le64_to_cpu(j->buf[old.idx].data->seq)); + res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; return 1; } @@ -394,6 +405,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re (flags & JOURNAL_RES_GET_NONBLOCK) != 0, NULL, _THIS_IP_); EBUG_ON(!res->ref); + BUG_ON(!res->seq); } return 0; } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index ee9cb17c3ccf..a0b17c6ed83e 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -12,7 +12,11 @@ /* btree write buffer steals 8 bits for its own purposes: */ #define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) -#define JOURNAL_BUF_BITS 2 +#define JOURNAL_STATE_BUF_BITS 2 +#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) +#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) + +#define JOURNAL_BUF_BITS 4 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) @@ -153,6 +157,7 @@ enum journal_flags { x(retry) \ x(blocked) \ x(max_in_flight) \ + x(max_open) \ x(journal_full) \ x(journal_pin_full) \ x(journal_stuck) \ @@ -238,6 +243,7 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; + u64 seq_write_started; /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; From 7606fb4d26e0684d40e40ea070a30af901e5bbbd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 
Feb 2025 21:26:27 -0500 Subject: [PATCH 049/180] bcachefs: Ignore backpointers to stripes in ec_stripe_update_extents() Prep work for stripe backpointers: this path previously would get very confused at being asked to process (remove redundant replicas) stripes. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d2a5e76e6479..1aa56d28de33 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1380,8 +1380,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b if (bp_k.k->type != KEY_TYPE_backpointer) continue; + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + if (bp.v->btree_id == BTREE_ID_stripes) + continue; + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - bkey_s_c_to_backpointer(bp_k), &last_flushed); + bp, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); From b7f648e2ec3c3a06fc1397b2f3e88480da56b7ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 17:12:47 -0500 Subject: [PATCH 050/180] bcachefs: Add comment explaining why asserts in invalidate_one_bucket() are impossible Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 43c29b0d2d20..a35455802280 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2093,6 +2093,13 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) goto out; + /* + * Impossible since alloc_lru_idx_read() only returns nonzero if the + * bucket is supposed to be on the cached bucket LRU (i.e. 
+ * BCH_DATA_cached) + * + * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 + */ BUG_ON(a->v.data_type != BCH_DATA_cached); BUG_ON(a->v.dirty_sectors); From fd49882f124a6315f0b0204abe2774f8b34a694b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Jan 2025 01:22:42 -0500 Subject: [PATCH 051/180] bcachefs: Add time_stat for btree writes We have other metadata IO types covered, this was missing. Note: this includes the time until completion, i.e. including parent pointer update. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_io.c | 16 +++++++++++----- fs/bcachefs/btree_io.h | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9791bfe08895..e8f4999806b6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -444,6 +444,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_node_sort) \ x(btree_node_read) \ x(btree_node_read_done) \ + x(btree_node_write) \ x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 91c624db2958..18413b4f22a3 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2016,7 +2016,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, bch2_journal_pin_drop(&c->journal, &w->journal); } -static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_write *w = btree_prev_write(b); unsigned long old, new; @@ -2024,6 +2024,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) bch2_btree_complete_write(c, b, w); + if (start_time) + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); + old = READ_ONCE(b->flags); do { new = old; @@ -2054,7 +2057,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) 
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } -static void btree_node_write_done(struct bch_fs *c, struct btree *b) +static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_trans *trans = bch2_trans_get(c); @@ -2062,7 +2065,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) /* we don't need transaction context anymore after we got the lock. */ bch2_trans_put(trans); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, start_time); six_unlock_read(&b->c.lock); } @@ -2072,6 +2075,7 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; + u64 start_time = wbio->start_time; int ret = 0; btree_bounce_free(c, @@ -2104,7 +2108,7 @@ static void btree_node_write_work(struct work_struct *work) } out: bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b); + btree_node_write_done(c, b, start_time); return; err: set_btree_node_noevict(b); @@ -2208,6 +2212,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool validate_before_checksum = false; enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; + u64 start_time = local_clock(); int ret; if (flags & BTREE_WRITE_ALREADY_STARTED) @@ -2416,6 +2421,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) wbio->data = data; wbio->data_bytes = bytes; wbio->sector_offset = b->written; + wbio->start_time = start_time; wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.first_btree_write = !b->written; @@ -2443,7 +2449,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, 0); } /* diff --git a/fs/bcachefs/btree_io.h 
b/fs/bcachefs/btree_io.h index 75ead3815d67..dbf76d22c660 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -52,6 +52,7 @@ struct btree_write_bio { void *data; unsigned data_bytes; unsigned sector_offset; + u64 start_time; struct bch_write_bio wbio; }; From 34a493089af2f2f773c0c802e84b0a493115cd50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 16:58:34 -0500 Subject: [PATCH 052/180] bcachefs: bch2_bkey_ptr_data_type() now correctly returns cached for cached ptrs Necessary for adding backpointers for cached pointers. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/backpointers.h | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index de25ba4ee94b..c556ccaffe89 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, if (a.stripe) return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; if (bch2_bucket_sectors_dirty(a)) - return data_type; + return bucket_data_type(data_type); if (a.cached_sectors) return BCH_DATA_cached; if (BCH_ALLOC_V4_NEED_DISCARD(&a)) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 5c6a17c21769..7786731d4ada 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, return BCH_DATA_btree; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - return p.has_ec ? 
BCH_DATA_stripe : BCH_DATA_user; + if (p.has_ec) + return BCH_DATA_stripe; + if (p.ptr.cached) + return BCH_DATA_cached; + else + return BCH_DATA_user; case KEY_TYPE_stripe: { const struct bch_extent_ptr *ptr = &entry->ptr; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); From f7f9be023860570fa290b2ff7a8a2da1d2b47739 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 28 Jan 2025 10:32:47 +0100 Subject: [PATCH 053/180] bcachefs: bch2_blacklist_entries_gc cleanup Use an eytzinger0_for_each() loop here. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_seq_blacklist.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 1f25c111c54c..e463d2d95359 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); - unsigned i; - for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); - src < bl->start + nr; - src++, i = eytzinger0_next(i, nr)) { + src = bl->start; + eytzinger0_for_each(i, nr) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; + src++; } unsigned new_nr = dst - bl->start; From d54b82ecc415ae2e563e6087acaf0e3c5d24daf5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 28 Jan 2025 01:39:23 +0100 Subject: [PATCH 054/180] bcachefs: EYTZINGER_DEBUG fix When EYTZINGER_DEBUG is defined, <linux/bug.h> needs to be included.
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 0541192d7bc0..5f2f96b1295e 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -6,6 +6,7 @@ #include #ifdef EYTZINGER_DEBUG +#include <linux/bug.h> #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) #else #define EYTZINGER_BUG_ON(cond) From 217ad1d7c707e310f3f4b7eeb9b7ea48f4cf3821 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 26 Nov 2024 12:12:36 +0100 Subject: [PATCH 055/180] bcachefs: eytzinger self tests: loop cleanups The iterator variable of eytzinger0_for_each() loops has been changed to be locally scoped at some point, so remove variables defined outside the loop that are now unused. In addition and for clarity, use a different variable inside those loops where an outside variable would be shadowed. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index da2cd11b3025..52458462d577 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats u64 last_q = 0; prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + eytzinger0_for_each(j, NR_QUANTILES) { + bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - u64 q = max(quantiles->entries[i].m, last_q); + u64 q = max(quantiles->entries[j].m, last_q); prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); if (is_last) prt_newline(out); @@ -707,7 +707,7 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) #if 0 void eytzinger1_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; pr_info("1 based eytzinger
test:"); @@ -740,7 +740,7 @@ void eytzinger1_test(void) void eytzinger0_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; pr_info("0 based eytzinger test:"); @@ -770,7 +770,7 @@ void eytzinger0_test(void) } } -static inline int cmp_u16(const void *_l, const void *_r, size_t size) +static inline int cmp_u16(const void *_l, const void *_r) { const u16 *l = _l, *r = _r; @@ -793,8 +793,8 @@ static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) c2 = test_array[i]; if (c1 != c2) { - eytzinger0_for_each(i, nr) - pr_info("[%3u] = %12u", i, test_array[i]); + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u", j, test_array[j]); pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", i, r, c1, c2); } @@ -812,9 +812,9 @@ void eytzinger0_find_test(void) eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); /* verify array is sorted correctly: */ - eytzinger0_for_each(i, nr) - BUG_ON(i != eytzinger0_last(nr) && - test_array[i] > test_array[eytzinger0_next(i, nr)]); + eytzinger0_for_each(j, nr) + BUG_ON(j != eytzinger0_last(nr) && + test_array[j] > test_array[eytzinger0_next(j, nr)]); for (i = 0; i < U16_MAX; i += 1 << 12) eytzinger0_find_test_val(test_array, nr, i); From 0ede49212a840a9e5e41722f819b8b9a6f69ccda Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 26 Nov 2024 21:55:49 +0100 Subject: [PATCH 056/180] bcachefs: eytzinger self tests: missing newline termination pr_info() format strings need to be newline terminated. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 52458462d577..8120e9d2667c 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -709,7 +709,7 @@ void eytzinger1_test(void) { unsigned inorder, size; - pr_info("1 based eytzinger test:"); + pr_info("1 based eytzinger test:\n"); for (size = 2; size < 65536; @@ -717,7 +717,7 @@ void eytzinger1_test(void) unsigned extra = eytzinger1_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); + pr_info("tree size %u\n", size); BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); @@ -742,7 +742,7 @@ void eytzinger0_test(void) unsigned inorder, size; - pr_info("0 based eytzinger test:"); + pr_info("0 based eytzinger test:\n"); for (size = 1; size < 65536; @@ -750,7 +750,7 @@ void eytzinger0_test(void) unsigned extra = eytzinger0_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); + pr_info("tree size %u\n", size); BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); @@ -794,8 +794,8 @@ static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) if (c1 != c2) { eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u", j, test_array[j]); - pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i\n", i, r, c1, c2); } } @@ -806,7 +806,7 @@ void eytzinger0_find_test(void) u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); for (nr = 1; nr < allocated; nr++) { - pr_info("testing %u elems", nr); + pr_info("testing %u elems\n", nr); get_random_bytes(test_array, nr * sizeof(test_array[0])); eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); From 
0766f5599cbba9cb567c8ed2d1da3bfc65550791 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 26 Nov 2024 23:33:55 +0100 Subject: [PATCH 057/180] bcachefs: eytzinger self tests: fix cmp_u16 typo Fix an obvious typo in cmp_u16(). Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 8120e9d2667c..3a69e3409e89 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -774,7 +774,7 @@ static inline int cmp_u16(const void *_l, const void *_r) { const u16 *l = _l, *r = _r; - return (*l > *r) - (*r - *l); + return (*l > *r) - (*r > *l); } static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) From ec70103f9b8a2d28644fcf0c027421cfa15553b1 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 26 Jan 2025 11:28:59 +0100 Subject: [PATCH 058/180] bcachefs: eytzinger[01]_test improvement In eytzinger[01]_test(), make sure that eytzinger[01]_for_each() iterates over all array elements. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 3a69e3409e89..2af77c410179 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -734,6 +734,7 @@ void eytzinger1_test(void) inorder++; } + BUG_ON(inorder - 1 != size); } } @@ -767,6 +768,7 @@ void eytzinger0_test(void) inorder++; } + BUG_ON(inorder != size); } } From e8a0966ffaa6aae8d64a0159a47541e32f2f587a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 26 Jan 2025 11:22:33 +0100 Subject: [PATCH 059/180] bcachefs: eytzinger0_find_test improvement In eytzinger0_find_test(), remember the smallest element seen so far instead of comparing adjacent array elements. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 2af77c410179..4114e5264965 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -808,15 +808,18 @@ void eytzinger0_find_test(void) u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); for (nr = 1; nr < allocated; nr++) { + u16 prev = 0; + pr_info("testing %u elems\n", nr); get_random_bytes(test_array, nr * sizeof(test_array[0])); eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); /* verify array is sorted correctly: */ - eytzinger0_for_each(j, nr) - BUG_ON(j != eytzinger0_last(nr) && - test_array[j] > test_array[eytzinger0_next(j, nr)]); + eytzinger0_for_each(j, nr) { + BUG_ON(test_array[j] < prev); + prev = test_array[j]; + } for (i = 0; i < U16_MAX; i += 1 << 12) eytzinger0_find_test_val(test_array, nr, i); From dc5ceaaad81a724e7090d8709290fae36e3f2a5d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 17:26:05 +0100 Subject: [PATCH 060/180] bcachefs: add eytzinger0_for_each_prev Add an eytzinger0_for_each_prev() macro for iterating through an eytzinger array in reverse. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 5 +++++ fs/bcachefs/util.c | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 5f2f96b1295e..99edae4bb995 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -244,6 +244,11 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) +#define eytzinger0_for_each_prev(_i, _size) \ + for (unsigned (_i) = eytzinger0_last((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_prev((_i), (_size))) + /* return greatest node <= @search, or -1 if not found */ static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 4114e5264965..ebe3b5b1e615 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -769,6 +769,15 @@ void eytzinger0_test(void) inorder++; } BUG_ON(inorder != size); + + inorder = size - 1; + eytzinger0_for_each_prev(eytz, size) { + BUG_ON(eytz != eytzinger0_first(size) && + eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); + + inorder--; + } + BUG_ON(inorder != -1); } } From c722b818a2f8c43d75efeba8005af5f17dc535f0 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 26 Jan 2025 17:57:06 +0100 Subject: [PATCH 061/180] bcachefs: improve eytzinger0_find_le self test Rename eytzinger0_find_test_val() to eytzinger0_find_test_le() and add a new eytzinger0_find_test_val() wrapper that calls it. We have already established that the array is sorted in eytzinger order, so we can use the eytzinger iterator functions and check the boundary conditions to verify the result of eytzinger0_find_le(). Only scan the entire array if we get an incorrect result. 
When we need to scan, use eytzinger0_for_each_prev() so that we'll stop at the highest matching element in the array in case there are duplicates; going through the array linearly wouldn't give us that. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index ebe3b5b1e615..d2f7ffcc4fd6 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -788,29 +788,48 @@ static inline int cmp_u16(const void *_l, const void *_r) return (*l > *r) - (*r > *l); } -static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) { - int i, c1 = -1, c2 = -1; - ssize_t r; + int r, s; + bool bad; r = eytzinger0_find_le(test_array, nr, sizeof(test_array[0]), cmp_u16, &search); - if (r >= 0) - c1 = test_array[r]; + if (r >= 0) { + if (test_array[r] > search) { + bad = true; + } else { + s = eytzinger0_next(r, nr); + bad = s >= 0 && test_array[s] <= search; + } + } else { + s = eytzinger0_last(nr); + bad = s >= 0 && test_array[s] <= search; + } - for (i = 0; i < nr; i++) - if (test_array[i] <= search && test_array[i] > c2) - c2 = test_array[i]; + if (bad) { + s = -1; + eytzinger0_for_each_prev(j, nr) { + if (test_array[j] <= search) { + s = j; + break; + } + } - if (c1 != c2) { eytzinger0_for_each(j, nr) pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i\n", - i, r, c1, c2); + pr_info("find_le(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); } } +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + eytzinger0_find_test_le(test_array, nr, search); +} + void eytzinger0_find_test(void) { unsigned i, nr, allocated = 1 << 12; From d148d804f2cc5f932ef840853958a36b77346a54 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: 
Tue, 28 Jan 2025 10:56:04 +0100 Subject: [PATCH 062/180] bcachefs: convert eytzinger0_find_le to be 1-based eytzinger0_find_le() is also easy to convert to 1-based eytzinger (but see the next commit). Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 99edae4bb995..08256fcaeeb7 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -253,27 +253,27 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - unsigned i, n = 0; + void *base1 = base - size; + unsigned i, n = 1; if (!nr) return -1; do { i = n; - n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); - } while (n < nr); + n = eytzinger1_child(i, cmp(base1 + i * size, search) <= 0); + } while (n <= nr); - if (n & 1) { + if (!(n & 1)) { /* * @i was greater than @search, return previous node: * * if @i was leftmost/smallest element, - * eytzinger0_prev(eytzinger0_first())) returns -1, as expected + * eytzinger1_prev(eytzinger1_first())) returns 0, as expected */ - return eytzinger0_prev(i, nr); - } else { - return i; + i = eytzinger1_prev(i, nr); } + return i - 1; } static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, From d384dada0ea999b11a3dd964047b7c69d15a8bd3 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 14:33:20 +0100 Subject: [PATCH 063/180] bcachefs: simplify eytzinger0_find_le Replace the over-complicated implementation of eytzinger0_find_le() by an equivalent, simpler version.
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 08256fcaeeb7..a530dbcde476 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -254,26 +254,12 @@ static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { void *base1 = base - size; - unsigned i, n = 1; + unsigned n = 1; - if (!nr) - return -1; - - do { - i = n; - n = eytzinger1_child(i, cmp(base1 + i * size, search) <= 0); - } while (n <= nr); - - if (!(n & 1)) { - /* - * @i was greater than @search, return previous node: - * - * if @i was leftmost/smallest element, - * eytzinger1_prev(eytzinger1_first())) returns 0, as expected - */ - i = eytzinger1_prev(i, nr); - } - return i - 1; + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n) + 1; + return n - 1; } static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, From d7cd33f7efbb91893bb20aa2baae4f8c37dd035a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 17:05:21 +0100 Subject: [PATCH 064/180] bcachefs: add eytzinger0_find_gt self test Add an eytzinger0_find_gt() self test similar to eytzinger0_find_le(). 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index d2f7ffcc4fd6..9c6e5d7122b4 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -825,9 +825,47 @@ static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) } } +static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_gt(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] <= search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] > search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] > search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] > search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_gt(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) { eytzinger0_find_test_le(test_array, nr, search); + eytzinger0_find_test_gt(test_array, nr, search); } void eytzinger0_find_test(void) From 2182f29545f385df9aa4861f9e08d0d378c26c9f Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 17:52:39 +0100 Subject: [PATCH 065/180] bcachefs: implement eytzinger0_find_gt directly Instead of implementing eytzinger0_find_gt() in terms of eytzinger0_find_le() and adjusting the result, implement it directly. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index a530dbcde476..568a04b16d09 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -262,20 +262,17 @@ static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, return n - 1; } +/* return smallest node > @search, or -1 if not found */ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + void *base1 = base - size; + unsigned n = 1; - /* - * if eytitzinger0_find_le() returned -1 - no element was <= search - we - * want to return the first element; next/prev identities mean this work - * as expected - * - * similarly if find_le() returns last element, we should return -1; - * identities mean this all works out: - */ - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, From 11223d0e7b091b11e0e533850c1007e8fc797c68 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 17:52:39 +0100 Subject: [PATCH 066/180] bcachefs: implement eytzinger0_find_ge directly Implement eytzinger0_find_ge() directly instead of implementing it in terms of eytzinger0_find_le() and adjusting the result. This turns eytzinger0_find_ge() into a minimum search, so when there are duplicate elements, the result of eytzinger0_find_ge() will now always point at the first matching element. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 568a04b16d09..e3713b7b4c27 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -275,15 +275,17 @@ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, return n - 1; } +/* return smallest node >= @search, or -1 if not found */ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + void *base1 = base - size; + unsigned n = 1; - if (idx < nr && !cmp(base + idx * size, search)) - return idx; - - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } #define eytzinger0_find(base, nr, size, _cmp, search) \ From 63ce189b00c37a6fd0297d45ddede5442adb0a28 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 17:15:36 +0100 Subject: [PATCH 067/180] bcachefs: add eytzinger0_find_ge self test Add an eytzinger0_find_ge() self test similar to eytzinger0_find_gt(). Note that this test requires eytzinger0_find_ge() to return the first matching element in the array in case of duplicates. To prevent bisection errors, we only add this test after strengthening the original implementation (see the previous commit).
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 9c6e5d7122b4..14686ff32003 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -862,10 +862,48 @@ static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) } } +static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_ge(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] < search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] >= search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] >= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] >= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_ge(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) { eytzinger0_find_test_le(test_array, nr, search); eytzinger0_find_test_gt(test_array, nr, search); + eytzinger0_find_test_ge(test_array, nr, search); } void eytzinger0_find_test(void) From 956032edd25d971da2820754242eaa7a925a8215 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 1 Feb 2025 13:55:46 +0100 Subject: [PATCH 068/180] bcachefs: Add eytzinger0_find self test Function eytzinger0_find() isn't currently covered, so add a self test. We can rely on eytzinger0_find_le() here because it is being tested independently. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 14686ff32003..525734528f35 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -899,11 +899,40 @@ static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) } } +static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) +{ + unsigned r; + int s; + bool bad; + + r = eytzinger0_find(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + + if (r < nr) { + bad = test_array[r] != search; + } else { + s = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + bad = s >= 0 && test_array[s] == search; + } + + if (bad) { + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find(%12u) = %3i is incorrect\n", + search, r); + BUG(); + } +} + static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) { eytzinger0_find_test_le(test_array, nr, search); eytzinger0_find_test_gt(test_array, nr, search); eytzinger0_find_test_ge(test_array, nr, search); + eytzinger0_find_test_eq(test_array, nr, search); } void eytzinger0_find_test(void) From 3849bcab4d3f0cf1aee56bdfc7bcae7b40b11657 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 28 Jan 2025 10:56:37 +0100 Subject: [PATCH 069/180] bcachefs: convert eytzinger0_find to be 1-based Several of the algorithms on eytzinger trees are implemented in terms of the eytzinger0 primitives. However, those algorithms can just as easily be expressed in terms of the eytzinger1 primitives, and that leads to better and easier to understand code. Start by converting eytzinger0_find(). 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index e3713b7b4c27..90cd5648b177 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -290,17 +290,17 @@ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ - void *_base = (base); \ + size_t _size = (size); \ + void *_base1 = (void *)(base) - _size; \ const void *_search = (search); \ size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + size_t _i = 1; \ int _res; \ \ - while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size))) \ - _i = eytzinger0_child(_i, _res > 0); \ - _i; \ + while (_i <= _nr && \ + (_res = _cmp(_search, _base1 + _i * _size))) \ + _i = eytzinger1_child(_i, _res > 0); \ + _i - 1; \ }) void eytzinger0_sort_r(void *, size_t, size_t, From 3ff0dd28d61e3b3ca378b897f98f0ea6810bf822 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 27 Nov 2024 13:26:10 +0100 Subject: [PATCH 070/180] bcachefs: convert eytzinger sort to be 1-based (1) In this first step, convert the eytzinger sort functions to use 1-based primitives. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.c | 48 +++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 2eaffe37b5e7..4fe02c93bfb3 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -148,28 +148,28 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr return cmp(a, b, priv); } -static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, +static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, cmp_r_func_t cmp_func, const void *priv, size_t l, size_t r) { - return do_cmp(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, cmp_func, priv); } -static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, +static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, swap_r_func_t swap_func, const void *priv, size_t l, size_t r) { - do_swap(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + do_swap(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, size, swap_func, priv); } -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) +static void eytzinger1_sort_r(void *base1, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) { int i, j, k; @@ -178,9 +178,9 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, swap_func = NULL; if (!swap_func) { - if (is_aligned(base, size, 8)) + if (is_aligned(base1, size, 8)) swap_func = SWAP_WORDS_64; - else if (is_aligned(base, size, 4)) + else if (is_aligned(base1, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; @@ -190,47 
+190,57 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, for (i = n / 2 - 1; i >= 0; --i) { /* Find the sift-down path all the way to the leaves. */ for (j = i; k = j * 2 + 1, k + 1 < n;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ if (j * 2 + 2 == n) j = j * 2 + 1; /* Backtrack to the correct location. */ - while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) + while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i + 1, j + 1) >= 0) j = (j - 1) / 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); } } /* sort */ for (i = n - 1; i > 0; --i) { - eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i + 1); /* Find the sift-down path all the way to the leaves. */ for (j = 0; k = j * 2 + 1, k + 1 < i;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ if (j * 2 + 2 == i) j = j * 2 + 1; /* Backtrack to the correct location. */ - while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) + while (j && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j + 1) >= 0) j = (j - 1) / 2; /* Shift the element into its correct place. 
*/ for (k = j; j;) { j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); } } } +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + void *base1 = base - size; + + return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); +} + void eytzinger0_sort(void *base, size_t n, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) From 68eb4c5fea4146e060a32d6434009dfae353709c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 27 Jan 2025 20:54:52 +0100 Subject: [PATCH 071/180] bcachefs: convert eytzinger sort to be 1-based (2) In this second step, transform the eytzinger indexes i, j, and k in eytzinger1_sort_r() from 0-based to 1-based. This step looks a bit messy, but the resulting code is slightly better. Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.c | 42 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 4fe02c93bfb3..0e742555cb0a 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -171,7 +171,7 @@ static void eytzinger1_sort_r(void *base1, size_t n, size_t size, swap_r_func_t swap_func, const void *priv) { - int i, j, k; + unsigned i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) @@ -187,46 +187,46 @@ static void eytzinger1_sort_r(void *base1, size_t n, size_t size, } /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { + for (i = n / 2; i >= 1; --i) { /* Find the sift-down path all the way to the leaves. */ - for (j = i; k = j * 2 + 1, k + 1 < n;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? 
k : k + 1; + for (j = i; k = j * 2, k < n;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == n) - j = j * 2 + 1; + if (j * 2 == n) + j *= 2; /* Backtrack to the correct location. */ - while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i + 1, j + 1) >= 0) - j = (j - 1) / 2; + while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { - j = (j - 1) / 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } /* sort */ - for (i = n - 1; i > 0; --i) { - eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i + 1); + for (i = n; i > 1; --i) { + eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); /* Find the sift-down path all the way to the leaves. */ - for (j = 0; k = j * 2 + 1, k + 1 < i;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k + 1, k + 2) > 0 ? k : k + 1; + for (j = 1; k = j * 2, k + 1 < i;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == i) - j = j * 2 + 1; + if (j * 2 + 1 == i) + j *= 2; /* Backtrack to the correct location. */ - while (j && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j + 1) >= 0) - j = (j - 1) / 2; + while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) + j /= 2; /* Shift the element into its correct place. 
*/ - for (k = j; j;) { - j = (j - 1) / 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j + 1, k + 1); + for (k = j; j > 1;) { + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } } From f27614652cd33184fb8a8464c1b0b893d350b33d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 28 Jan 2025 18:24:15 +0100 Subject: [PATCH 072/180] bcachefs: eytzinger1_{next,prev} cleanup The eytzinger code was previously relying on the following wrap-around properties and their "eytzinger0" equivalents: eytzinger1_prev(0, size) == eytzinger1_last(size) eytzinger1_next(0, size) == eytzinger1_first(size) However, these properties are no longer relied upon and no longer necessary, so remove the corresponding asserts and forbid the use of eytzinger1_prev(0, size) and eytzinger1_next(0, size). This allows to further simplify the code in eytzinger1_next() and eytzinger1_prev(): where the left shifting happens, eytzinger1_next() is trying to move i to the lowest child on the left, which is equivalent to doubling i until the next doubling would cause it to be greater than size. This is implemented by shifting i to the left so that the most significant bits align and then shifting i to the right by one if the result is greater than size. Likewise, eytzinger1_prev() is trying to move to the lowest child on the right; the same applies here. The 1-offset in (size - 1) in eytzinger1_next() isn't needed at all, but the equivalent offset in eytzinger1_prev() is surprisingly needed to preserve the 'eytzinger1_prev(0, size) == eytzinger1_last(size)' property. However, since we no longer support that property, we can get rid of these offsets as well. This saves one addition in each function and makes the code less confusing. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 18 ++++-------------- fs/bcachefs/util.c | 12 ------------ 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 90cd5648b177..643c1f716061 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -57,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) return rounddown_pow_of_two(size + 1) - 1; } -/* - * eytzinger1_next() and eytzinger1_prev() have the nice properties that - * - * eytzinger1_next(0) == eytzinger1_first()) - * eytzinger1_prev(0) == eytzinger1_last()) - * - * eytzinger1_prev(eytzinger1_first()) == 0 - * eytzinger1_next(eytzinger1_last()) == 0 - */ - static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i >>= i > size; } else { i >>= ffz(i) + 1; @@ -85,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i -= 1; i >>= i > size; } else { diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 525734528f35..a7edbcca1a84 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -719,12 +719,6 @@ void eytzinger1_test(void) if (!(size % 4096)) pr_info("tree size %u\n", size); - BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); - BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); - - BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); - BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); - inorder = 1; 
eytzinger1_for_each(eytz, size) { BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); @@ -753,12 +747,6 @@ void eytzinger0_test(void) if (!(size % 4096)) pr_info("tree size %u\n", size); - BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); - BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); - - BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); - BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); - inorder = 0; eytzinger0_for_each(eytz, size) { BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); From 3faa4647a0c3fd0e27e966a8c72ab9863014d518 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 11:55:33 -0500 Subject: [PATCH 073/180] bcachefs: metadata_target is not an inode option This option only applies filesystem wide. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9d397fc2a1f0..071a92ec8a14 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -197,7 +197,7 @@ enum fsck_err_opts { BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or label for metadata writes") \ From 1ccbcd320577271c85d9a5bfbdd3394cb9baadb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 17:04:08 -0500 Subject: [PATCH 074/180] bcachefs: bch2_write_op_error() now prints info about data update A user has been seeing the "error verifying existing checksum while rewriting existing data (memory corruption?)" error. This generally indicates a hardware issue (and that may be the case here), but it might also indicate a bug, in which case we need more information to look for patterns. 
Reported-by: Roland Vet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 8 ++-- fs/bcachefs/error.c | 6 +++ fs/bcachefs/error.h | 1 + fs/bcachefs/io_write.c | 92 ++++++++++++++++++++++++++++-------------- fs/bcachefs/io_write.h | 8 +++- 5 files changed, 80 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 114bf2f3879f..31467f77930f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -271,8 +271,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: extent too big"); + bch2_write_op_error(&buf, op, op->pos.offset, + "extent too big to decompress"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; @@ -283,8 +283,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, if (__bio_uncompress(c, bio, data.b, *crc)) { if (!c->opts.no_data_io) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: decompression error"); + bch2_write_op_error(&buf, op, op->pos.offset, + "decompression error"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c8fc58fab958..3f93a5a6bbfa 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -580,3 +580,9 @@ int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printb prt_printf(out, " offset %llu: ", pos.offset << 8); return 0; } + +void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) +{ + bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 76da0e88cee8..b3cc69f29fd9 100644 --- a/fs/bcachefs/error.h +++ 
b/fs/bcachefs/error.h @@ -243,5 +243,6 @@ int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subv void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); +void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 076e39474610..738bdbfbdb14 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -396,29 +396,61 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, - u64 offset) +void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset, const char *fmt, ...) { - bch2_inum_offset_err_msg(op->c, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_move ? "(internal move)" : ""); + if (op->subvol) + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9)); + else { + struct bpos pos = op->pos; + pos.offset = offset; + lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); + } + + prt_str(out, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(out, "\n from internal move "); + bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + } } -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset, + const char *fmt, ...) 
{ - __bch2_write_op_error(out, op, op->pos.offset); -} + if (op->subvol) + bch2_inum_offset_err_msg(op->c, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + else { + struct bpos pos = op->pos; + pos.offset = offset; + bch2_inum_snap_offset_err_msg(op->c, out, pos); + } -static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset) -{ - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_move ? "(internal move)" : ""); + prt_str(out, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(out, "\n from internal move "); + bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + } } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -561,8 +593,8 @@ static void __bch2_write_index(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1114,8 +1146,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, csum_err: { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch2_write_op_error(&buf, op, op->pos.offset, + "error verifying existing checksum while rewriting existing data (memory corruption?)"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } 
@@ -1211,8 +1243,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1379,8 +1411,8 @@ static void bch2_nocow_write(struct bch_write_op *op) if (ret) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); + bch2_write_op_error(&buf, op, op->pos.offset, + "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); op->error = ret; @@ -1502,8 +1534,8 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(ret < 0)) { if (!(op->flags & BCH_WRITE_alloc_nowait)) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); + bch2_write_op_error(&buf, op, op->pos.offset, + "%s(): %s", __func__, bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1634,8 +1666,8 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "misaligned write"); + bch2_write_op_error(&buf, op, op->pos.offset, + "misaligned write"); printbuf_exit(&buf); op->error = -EIO; goto err; diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 02cca52be0bd..bf942566a8eb 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -20,7 +20,13 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct 
bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); +__printf(5, 6) +void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64, const char *, ...); + +__printf(4, 5) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64, + const char *, ...); #define BCH_WRITE_FLAGS() \ x(alloc_nowait) \ From cb87f623c1ef5320431989c5215f9d46f2bc2a6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Feb 2025 09:47:39 -0500 Subject: [PATCH 075/180] bcachefs: minor journal errcode cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 40d3ad5a1e5c..8d4f3bfaa228 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -981,7 +981,7 @@ int bch2_journal_meta(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) - return -EROFS; + return -BCH_ERR_erofs_no_writes; int ret = __bch2_journal_meta(j); bch2_write_ref_put(c, BCH_WRITE_REF_journal); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 61f71e7baff2..7d59ccc07315 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1515,7 +1515,7 @@ static void __journal_write_alloc(struct journal *j, * @j: journal object * @w: journal buf (entry to be written) * - * Returns: 0 on success, or -EROFS on failure + * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure */ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { @@ -1624,8 +1624,7 @@ static CLOSURE_CALLBACK(journal_write_done) } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); - if (bch2_mark_replicas(c, &replicas.e)) - err = -EIO; + err = 
bch2_mark_replicas(c, &replicas.e); } if (err) @@ -1988,7 +1987,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * write anything at all. */ if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return -EIO; + return error; if (error || w->noflush || From e1304967078c17af2520402e3e53a48c1e001072 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:37:50 -0500 Subject: [PATCH 076/180] bcachefs: bch2_lru_change() checks for no-op Minor cleanup, no reason for the caller to have to do this. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 32 +++++++++++++------------------- fs/bcachefs/lru.c | 6 +++--- fs/bcachefs/lru.h | 11 ++++++++++- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a35455802280..e1061524bdf5 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -889,26 +889,20 @@ int bch2_trigger_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = bch2_current_io_time(c, READ); - u64 old_lru = alloc_lru_idx_read(*old_a); - u64 new_lru = alloc_lru_idx_read(*new_a); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new.k->p.inode, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, new.k->p.inode, + bucket_to_u64(new.k->p), + alloc_lru_idx_read(*old_a), + alloc_lru_idx_read(*new_a)); + if (ret) + goto err; - old_lru = alloc_lru_idx_fragmentation(*old_a, ca); - new_lru = alloc_lru_idx_fragmentation(*new_a, ca); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new.k->p), + alloc_lru_idx_fragmentation(*old_a, ca), + alloc_lru_idx_fragmentation(*new_a, ca)); + if (ret) + goto err; if (old_a->gen != 
new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ce794d55818f..8ec16ae8daa6 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -59,9 +59,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); } -int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) +int __bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) { if (old_time == new_time) return 0; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index f31a6cf1514c..2facc0758cb3 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -46,7 +46,16 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); + +static inline int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) +{ + return old_time != new_time + ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) + : 0; +} struct bkey_buf; int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); From b8e37c1645e96348adcfe48786f6f46930048914 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:39:50 -0500 Subject: [PATCH 077/180] bcachefs: s/BCH_LRU_FRAGMENTATION_START/BCH_LRU_BUCKET_FRAGMENTATION/ FRAGMENTATION_START was incorrect, there's currently only one fragmentation LRU (at the end of the reserved bits for LRU type), and we're getting ready to add a stripe fragmentation lru - so give it a better name. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 ++-- fs/bcachefs/lru.h | 2 +- fs/bcachefs/lru_format.h | 2 +- fs/bcachefs/movinggc.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e1061524bdf5..87ff50a3cd81 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -897,7 +897,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, + BCH_LRU_BUCKET_FRAGMENTATION, bucket_to_u64(new.k->p), alloc_lru_idx_fragmentation(*old_a, ca), alloc_lru_idx_fragmentation(*new_a, ca)); @@ -1699,7 +1699,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); if (lru_idx) { - ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, lru_idx, alloc_k, last_flushed); if (ret) goto err; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 2facc0758cb3..398cc25db459 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -28,7 +28,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; - if (lru_id == BCH_LRU_FRAGMENTATION_START) + if (lru_id == BCH_LRU_BUCKET_FRAGMENTATION) return BCH_LRU_fragmentation; return BCH_LRU_read; } diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h index f372cb3b8cda..353a352d3fb9 100644 --- a/fs/bcachefs/lru_format.h +++ b/fs/bcachefs/lru_format.h @@ -17,7 +17,7 @@ enum bch_lru_type { #undef x }; -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) +#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) #define LRU_TIME_BITS 48 #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6718dc37c5a3..fa19fc44622c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,8 +167,8 @@ 
static int bch2_copygc_get_buckets(struct moving_context *ctxt, bch2_trans_begin(trans); ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), - lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret2 = 0; From 3aff608b86440a7fd1a5486c90124f1963f6d4dc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:42:45 -0500 Subject: [PATCH 078/180] bcachefs: decouple bch2_lru_check_set() from alloc btree Pass in the backpointer explicitly, instead of assuming 'referring_k' is an alloc key and calculating it. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- fs/bcachefs/lru.c | 10 +++++----- fs/bcachefs/lru.h | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 87ff50a3cd81..58cdb6a0acf9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1700,6 +1700,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); if (lru_idx) { ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, + bucket_to_u64(alloc_k.k->p), lru_idx, alloc_k, last_flushed); if (ret) goto err; @@ -1729,7 +1730,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = &a_mut->v; } - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ], alloc_k, last_flushed); if (ret) goto err; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 8ec16ae8daa6..dc6b9a80a8b5 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -78,7 +78,9 @@ static const char * const bch2_lru_types[] = { }; int 
bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, u64 time, + u16 lru_id, + u64 dev_bucket, + u64 time, struct bkey_s_c referring_k, struct bkey_buf *last_flushed) { @@ -87,9 +89,7 @@ int bch2_lru_check_set(struct btree_trans *trans, struct btree_iter lru_iter; struct bkey_s_c lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, - bucket_to_u64(referring_k.k->p), - time), 0); + lru_pos(lru_id, dev_bucket, time), 0); int ret = bkey_err(lru_k); if (ret) return ret; @@ -104,7 +104,7 @@ int bch2_lru_check_set(struct btree_trans *trans, " %s", bch2_lru_types[lru_type(lru_k)], (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + ret = bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) goto err; } diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 398cc25db459..dea1d75cc9c1 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -58,7 +58,7 @@ static inline int bch2_lru_change(struct btree_trans *trans, } struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); +int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); int bch2_check_lrus(struct bch_fs *); From bc76ba70d213ea6a84a824c3bd5c4100900b18cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 18:48:12 -0500 Subject: [PATCH 079/180] bcachefs: Rework bch2_check_lru_key() It's now easier to add new LRU types. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/lru.c | 77 +++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index dc6b9a80a8b5..98ab8496f29d 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -116,49 +116,67 @@ int bch2_lru_check_set(struct btree_trans *trans, return ret; } +static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) +{ + enum bch_lru_type type = lru_type(lru_k); + + switch (type) { + case BCH_LRU_read: + case BCH_LRU_fragmentation: + return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); + default: + BUG(); + } +} + +static u64 bkey_lru_type_idx(struct bch_fs *c, + enum bch_lru_type type, + struct bkey_s_c k) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + + switch (type) { + case BCH_LRU_read: + a = bch2_alloc_to_v4(k, &a_convert); + return alloc_lru_idx_read(*a); + case BCH_LRU_fragmentation: { + a = bch2_alloc_to_v4(k, &a_convert); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + u64 idx = ca + ? 
alloc_lru_idx_fragmentation(*a, ca) + : 0; + rcu_read_unlock(); + return idx; + } + default: + BUG(); + } +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; - enum bch_lru_type type = lru_type(lru_k); - struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); - u64 idx; - int ret; - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos); + struct bbpos bp = lru_pos_to_bp(lru_k); - if (fsck_err_on(!ca, - trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - alloc_pos.inode, alloc_pos.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); + int ret = bkey_err(k); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); + enum bch_lru_type type = lru_type(lru_k); + u64 idx = bkey_lru_type_idx(c, type, k); - switch (type) { - case BCH_LRU_read: - idx = alloc_lru_idx_read(*a); - break; - case BCH_LRU_fragmentation: - idx = alloc_lru_idx_fragmentation(*a, ca); - break; - } - - if (lru_k.k->type != KEY_TYPE_set || - lru_pos_time(lru_k.k->p) != idx) { + if (lru_pos_time(lru_k.k->p) != idx) { ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); if (ret) goto err; @@ -176,7 +194,6 @@ static int bch2_check_lru_key(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; From cc297dfb41834f91cf594893dfff7ebe321190eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 
20:32:37 -0500 Subject: [PATCH 080/180] bcachefs: bch2_trigger_stripe_ptr() no longer uses ec_stripes_heap_lock Introduce per-entry locks, like with struct bucket - the stripes heap is going away. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 6 +++--- fs/bcachefs/buckets.h | 27 --------------------------- fs/bcachefs/buckets_types.h | 27 +++++++++++++++++++++++++++ fs/bcachefs/ec.h | 14 ++++++++++++++ fs/bcachefs/ec_types.h | 5 ++--- 5 files changed, 46 insertions(+), 33 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 345b117a4a4a..88af61bc799d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -674,10 +674,10 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, return -BCH_ERR_ENOMEM_mark_stripe_ptr; } - mutex_lock(&c->ec_stripes_heap_lock); + gc_stripe_lock(m); if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", @@ -693,7 +693,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_replicas, }; memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); acc.replicas.data_type = data_type; int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a9acdd6c0c86..6aeec1c0973c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -39,33 +39,6 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the size of these - during fsck, we need - * in memory state 
for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte. - * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - static inline void bucket_unlock(struct bucket *b) { BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 7174047b8e92..900b8680c8b5 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -7,6 +7,33 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during fsck, we need + * in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. + * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte. 
+ */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + struct bucket { u8 lock; u8 gen_valid:1; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 583ca6a226da..4c9511887655 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -132,6 +132,20 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, m->sectors); } +static inline void gc_stripe_unlock(struct gc_stripe *s) +{ + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); + wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); +} + +static inline void gc_stripe_lock(struct gc_stripe *s) +{ + wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); +} + struct bch_read_bio; struct ec_stripe_buf { diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 8d1e70e830ac..37558cc2d89f 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -20,12 +20,11 @@ struct stripe { }; struct gc_stripe { + u8 lock; + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 sectors; - u8 nr_blocks; u8 nr_redundant; - - unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; From c7c07bf250cb0f391656f90bc8b11248df767ed3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Feb 2025 10:09:31 -0500 Subject: [PATCH 081/180] bcachefs: Better trigger ordering Transactional triggers need to run in a defined ordering, which is not quite the same as btree ID integer comparison. 
Previously this was handled in a hacky way in bch2_trans_commit_run_triggers(), since it was only the alloc btree that needed special handling, but upcoming stripe btree changes are going to require more ordering changes - so, define that ordering. Next patch will change the transaction commit path to use it. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 13 +++++++++++++ fs/bcachefs/btree_update.c | 1 + 2 files changed, 14 insertions(+) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a09cbe9cd94f..77578da2d23f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -423,6 +423,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) struct btree_insert_entry { unsigned flags; + u8 sort_order; u8 bkey_type; enum btree_id btree_id:8; u8 level:4; @@ -853,6 +854,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree) return BIT_ULL(btree) & mask; } +static inline u8 btree_trigger_order(enum btree_id btree) +{ + switch (btree) { + case BTREE_ID_alloc: + return U8_MAX; + case BTREE_ID_stripes: + return U8_MAX - 1; + default: + return btree; + } +} + struct btree_root { struct btree *b; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 13d794f201a5..47e54eedd0bc 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -397,6 +397,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, n = (struct btree_insert_entry) { .flags = flags, + .sort_order = btree_trigger_order(path->btree_id), .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, From 65bc7688b8feb5511e62beb01acaa4bfb7016732 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Feb 2025 13:45:46 -0500 Subject: [PATCH 082/180] bcachefs: rework bch2_trans_commit_run_triggers() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 91 
++++++++++++-------------------- fs/bcachefs/btree_update.c | 2 +- 2 files changed, 35 insertions(+), 58 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index c4f524b2ca9a..892d20a50a52 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -336,6 +336,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->cached != path->cached); BUG_ON(i->level != path->level); BUG_ON(i->btree_id != path->btree_id); + BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_internal_snapshot_node) && @@ -517,69 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } } -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned *btree_id_updates_start) -{ - bool trans_trigger_run; - - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (unsigned i = *btree_id_updates_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id < btree_id) { - *btree_id_updates_start = i; - continue; - } - - int ret = run_one_trans_trigger(trans, trans->updates + i); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - i->btree_id == btree_id && - btree_node_type_has_trans_triggers(i->bkey_type) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); - - return 0; -} - static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - unsigned btree_id = 0, btree_id_updates_start = 0; - int ret = 0; + unsigned sort_id_start = 0; - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this 
is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; + while (sort_id_start < trans->nr_updates) { + unsigned i, sort_id = trans->updates[sort_id_start].sort_order; + bool trans_trigger_run; - ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); - if (ret) - return ret; + /* + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being + * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop + * references before they are re-added. + * + * Running triggers will append more updates to the list of + * updates as we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = sort_id_start; + i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; + i++) { + if (trans->updates[i].sort_order < sort_id) { + sort_id_start = i; + continue; + } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + + sort_id_start = i; } - btree_id_updates_start = 0; - ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); - if (ret) - return ret; - #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 47e54eedd0bc..b3e346b5f8d7 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -17,7 +17,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { - return cmp_int(l->btree_id, r->btree_id) ?: + return cmp_int(l->sort_order, r->sort_order) ?: cmp_int(l->cached, r->cached) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->k->k.p, r->k->k.p); From 
15800f3d4b0134a0f5abb9a1623921380d94c027 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 01:33:35 -0500 Subject: [PATCH 083/180] bcachefs: bcachefs_metadata_version_cached_backpointers Cached pointers now have backpointers. This means that we'll be able to kill cached pointers in the bucket_invalidate path, when invalidating/reusing buckets containing cached data, instead of leaving them around to be cleaned up by gc_gens garbage collection - which requires a full metadata scan. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 14 +++++++------- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/buckets.c | 8 +++----- fs/bcachefs/sb-downgrade.c | 5 ++++- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index bb799b86aa69..c9dfc3657696 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -611,9 +611,6 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; @@ -621,9 +618,11 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + + bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); rcu_read_unlock(); - if (check || empty) { + if ((check || empty) && !stale) { struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); @@ -857,9 +856,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b goto err; } - /* Cached pointers don't have backpointers: */ - if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_cached] != a->cached_sectors || 
sectors[ALLOC_stripe] != a->stripe_sectors) { if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); @@ -868,6 +866,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b } if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: -BCH_ERR_transaction_restart_nested; @@ -875,7 +874,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b } if (!sectors[ALLOC_dirty] && - !sectors[ALLOC_stripe]) + !sectors[ALLOC_stripe] && + !sectors[ALLOC_cached]) __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); else __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f70f0108401f..ef5009b18dd5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -686,7 +686,8 @@ struct bch_sb_field_ext { x(inode_depth, BCH_VERSION(1, 17)) \ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ x(autofix_errors, BCH_VERSION(1, 19)) \ - x(directory_size, BCH_VERSION(1, 20)) + x(directory_size, BCH_VERSION(1, 20)) \ + x(cached_backpointers, BCH_VERSION(1, 21)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 88af61bc799d..bb7742cf0014 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -590,11 +590,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (ret) goto err; - if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { diff --git a/fs/bcachefs/sb-downgrade.c 
b/fs/bcachefs/sb-downgrade.c index 051214fdc735..ef985c851300 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -90,7 +90,10 @@ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(cached_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ From 942a418c7a45a970a3cf08e8b879865838f5488f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 18:12:57 -0500 Subject: [PATCH 084/180] bcachefs: Invalidate cached data by backpointers If we don't leave stale pointers around, we won't have to deal with bucket gen wraparound. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 102 +++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 58cdb6a0acf9..97c2df18dfa4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2055,16 +2055,71 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } +static int invalidate_one_bp(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) +{ + struct btree_iter extent_iter; + struct bkey_s_c extent_k = + bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed); + int ret = bkey_err(extent_k); + if (ret) + return ret; + + struct bkey_i *n = + bch2_bkey_make_mut(trans, &extent_iter, &extent_k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); +err: + bch2_trans_iter_exit(trans, &extent_iter); + return ret; +} + +static int 
invalidate_one_bucket_by_bps(struct btree_trans *trans, + struct bch_dev *ca, + struct bpos bucket, + u8 gen, + struct bkey_buf *last_flushed) +{ + struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); + struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket); + + return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + bp_start, bp_end, 0, k, + NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + if (bp.v->bucket_gen != gen) + continue; + + /* filter out bps with gens that don't match */ + + invalidate_one_bp(trans, ca, bp, last_flushed); + })); +} + +noinline_for_stack static int invalidate_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *lru_iter, struct bkey_s_c lru_k, + struct bkey_buf *last_flushed, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; - struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); - unsigned cached_sectors; + struct btree_iter alloc_iter = {}; int ret = 0; if (*nr_to_invalidate <= 0) @@ -2081,13 +2136,18 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); - ret = PTR_ERR_OR_ZERO(a); + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, + BTREE_ITER_cached); + ret = bkey_err(alloc_k); if (ret) - goto out; + return ret; + + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) goto out; /* @@ -2097,26 +2157,16 @@ static int 
invalidate_one_bucket(struct btree_trans *trans, * * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 */ - BUG_ON(a->v.data_type != BCH_DATA_cached); - BUG_ON(a->v.dirty_sectors); + BUG_ON(a->data_type != BCH_DATA_cached); + BUG_ON(a->dirty_sectors); - if (!a->v.cached_sectors) + if (!a->cached_sectors) bch_err(c, "invalidating empty bucket, confused"); - cached_sectors = a->v.cached_sectors; + unsigned cached_sectors = a->cached_sectors; + u8 gen = a->gen; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - a->v.gen++; - a->v.data_type = 0; - a->v.dirty_sectors = 0; - a->v.stripe_sectors = 0; - a->v.cached_sectors = 0; - a->v.io_time[READ] = bch2_current_io_time(c, READ); - a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); - - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); if (ret) goto out; @@ -2124,6 +2174,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, --*nr_to_invalidate; out: fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; } @@ -2150,6 +2201,10 @@ static void bch2_do_invalidates_work(struct work_struct *work) struct btree_trans *trans = bch2_trans_get(c); int ret = 0; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + ret = bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; @@ -2174,7 +2229,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (!k.k) break; - ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate); restart_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -2187,6 +2242,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) err: bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); + bch2_bkey_buf_exit(&last_flushed, c); 
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } From 69bd8a927702cec62b023948be22ac817d2643a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 19:56:11 -0500 Subject: [PATCH 085/180] bcachefs: Advance bch_alloc.oldest_gen if no stale pointers Now that we've got cached backpointers and aren't leaving around stale pointers on bucket invalidation, we no longer need the periodic (rare) gc_gens - which recalculates each bucket's oldest gen to avoid wraparound. We can't delete that code because we've got to support existing filesystems that will still have stale pointers, but this gets rid of another scalability limit. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 97c2df18dfa4..c5c8497a6339 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -871,6 +871,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (data_type_is_empty(new_a->data_type) && BCH_ALLOC_V4_NEED_INC_GEN(new_a) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { + if (new_a->oldest_gen == new_a->gen && + !bch2_bucket_sectors_total(*new_a)) + new_a->oldest_gen++; new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); alloc_data_type_set(new_a, new_a->data_type); From 88d961b518826e5e98e171d876b87b642f12de6a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 01:34:00 -0500 Subject: [PATCH 086/180] bcachefs: bcachefs_metadata_version_stripe_backpointers Stripes now have backpointers. This is needed for proper scrub - stripe checksums need to be verified, separately from extents within the stripe, since a block may not be full of live extents but it's still needed for reconstruct. And this will be needed for (efficient) evacuate/repair paths. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.h | 15 ++++++++++++++- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/ec.c | 16 ++++++++++++++-- fs/bcachefs/move.c | 3 +++ fs/bcachefs/sb-downgrade.c | 3 +++ 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 7786731d4ada..16575dbc5736 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -152,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bkey_i_backpointer *bp) { bkey_backpointer_init(&bp->k_i); - bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); + bp->k.p.inode = p.ptr.dev; + + if (k.k->type != KEY_TYPE_stripe) + bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; + else { + /* + * Put stripe backpointers where they won't collide with the + * extent backpointers within the stripe: + */ + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1; + } + bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ef5009b18dd5..bf3723a2bca4 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -687,7 +687,8 @@ struct bch_sb_field_ext { x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ x(autofix_errors, BCH_VERSION(1, 19)) \ x(directory_size, BCH_VERSION(1, 20)) \ - x(cached_backpointers, BCH_VERSION(1, 21)) + x(cached_backpointers, BCH_VERSION(1, 21)) \ + x(stripe_backpointers, BCH_VERSION(1, 22)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1aa56d28de33..36590c0ce09f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -298,10 +298,22 @@ static int mark_stripe_bucket(struct btree_trans *trans, 
struct bpos bucket = PTR_BUCKET_POS(ca, ptr); if (flags & BTREE_TRIGGER_transactional) { + struct extent_ptr_decoded p = { + .ptr = *ptr, + .crc = bch2_extent_crc_unpack(s.k, NULL), + }; + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, + (const union bch_extent_entry *) ptr, &bp); + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: + bch2_bucket_backpointer_mod(trans, s.s_c, &bp, + !(flags & BTREE_TRIGGER_overwrite)); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 12519181026f..ee489d222fba 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -774,6 +774,9 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!(data_types & BIT(bp.v->data_type))) goto next; + if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) + goto next; + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index ef985c851300..acb5d845841e 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -92,6 +92,9 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(cached_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(stripe_backpointers, \ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ BCH_FSCK_ERR_ptr_to_missing_backpointer) From 6756e385a5bdf2e048ce2894208af9497062dcb9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Feb 2025 21:31:03 -0500 Subject: [PATCH 087/180] bcachefs: 
bcachefs_metadata_version_stripe_lru Add a persistent LRU for stripes, ordered by "number of empty blocks", i.e. order in which we wish to reuse them. This will replace the in-memory stripes heap, so we can kill off reading stripes into memory at startup. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/ec.c | 51 ++++++++++++++++++++++++++++++++++ fs/bcachefs/ec.h | 27 ++++++++++++++++++ fs/bcachefs/lru.c | 7 +++++ fs/bcachefs/lru.h | 9 ++++-- fs/bcachefs/lru_format.h | 4 ++- 7 files changed, 99 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c5c8497a6339..ecad4a78c3f7 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1757,7 +1757,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: + bch2_check_stripe_to_lru_refs(c); bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index bf3723a2bca4..b4ac311f21a1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -688,7 +688,8 @@ struct bch_sb_field_ext { x(autofix_errors, BCH_VERSION(1, 19)) \ x(directory_size, BCH_VERSION(1, 20)) \ x(cached_backpointers, BCH_VERSION(1, 21)) \ - x(stripe_backpointers, BCH_VERSION(1, 22)) + x(stripe_backpointers, BCH_VERSION(1, 22)) \ + x(stripe_lru, BCH_VERSION(1, 23)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 36590c0ce09f..1090cdb7d5cc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -20,6 +20,7 @@ #include "io_read.h" #include "io_write.h" #include "keylist.h" +#include 
"lru.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -411,6 +412,15 @@ int bch2_trigger_stripe(struct btree_trans *trans, (new_s->nr_blocks != old_s->nr_blocks || new_s->nr_redundant != old_s->nr_redundant)); + if (flags & BTREE_TRIGGER_transactional) { + int ret = bch2_lru_change(trans, + BCH_LRU_STRIPE_FRAGMENTATION, + idx, + stripe_lru_pos(old_s), + stripe_lru_pos(new_s)); + if (ret) + return ret; + } if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* @@ -1175,6 +1185,10 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) return ret; } +/* + * XXX + * can we kill this and delete stripes from the trigger? + */ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = @@ -2519,3 +2533,40 @@ int bch2_fs_ec_init(struct bch_fs *c) return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } + +static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, + struct bkey_s_c k, + struct bkey_buf *last_flushed) +{ + if (k.k->type != KEY_TYPE_stripe) + return 0; + + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + u64 lru_idx = stripe_lru_pos(s.v); + if (lru_idx) { + int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION, + k.k->p.offset, lru_idx, k, last_flushed); + if (ret) + return ret; + } + return 0; +} + +int bch2_check_stripe_to_lru_refs(struct bch_fs *c) +{ + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, + POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 4c9511887655..cd1c837e4933 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -92,6 +92,31 @@ static inline void 
stripe_csum_set(struct bch_stripe *s, memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); } +#define STRIPE_LRU_POS_EMPTY 1 + +static inline u64 stripe_lru_pos(const struct bch_stripe *s) +{ + if (!s) + return 0; + + unsigned blocks_empty = 0, blocks_nonempty = 0; + + for (unsigned i = 0; i < s->nr_blocks; i++) { + blocks_empty += !stripe_blockcount_get(s, i); + blocks_nonempty += !!stripe_blockcount_get(s, i); + } + + /* Will be picked up by the stripe_delete worker */ + if (!blocks_nonempty) + return STRIPE_LRU_POS_EMPTY; + + if (!blocks_empty) + return 0; + + /* invert: more blocks empty = reuse first */ + return LRU_TIME_MAX - blocks_empty; +} + static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, const struct bch_extent_ptr *data_ptr, unsigned sectors) @@ -282,4 +307,6 @@ void bch2_fs_ec_exit(struct bch_fs *); void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); +int bch2_check_stripe_to_lru_refs(struct bch_fs *); + #endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 98ab8496f29d..a299d9ec8ee4 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" +#include "ec.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -124,6 +125,8 @@ static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) case BCH_LRU_read: case BCH_LRU_fragmentation: return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); + case BCH_LRU_stripes: + return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); default: BUG(); } @@ -151,6 +154,10 @@ static u64 bkey_lru_type_idx(struct bch_fs *c, rcu_read_unlock(); return idx; } + case BCH_LRU_stripes: + return k.k->type == KEY_TYPE_stripe + ? 
stripe_lru_pos(bkey_s_c_to_stripe(k).v) + : 0; default: BUG(); } diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index dea1d75cc9c1..8abd0aa2083a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; - if (lru_id == BCH_LRU_BUCKET_FRAGMENTATION) + switch (lru_id) { + case BCH_LRU_BUCKET_FRAGMENTATION: return BCH_LRU_fragmentation; - return BCH_LRU_read; + case BCH_LRU_STRIPE_FRAGMENTATION: + return BCH_LRU_stripes; + default: + return BCH_LRU_read; + } } int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h index 353a352d3fb9..b7392ad8e41f 100644 --- a/fs/bcachefs/lru_format.h +++ b/fs/bcachefs/lru_format.h @@ -9,7 +9,8 @@ struct bch_lru { #define BCH_LRU_TYPES() \ x(read) \ - x(fragmentation) + x(fragmentation) \ + x(stripes) enum bch_lru_type { #define x(n) BCH_LRU_##n, @@ -18,6 +19,7 @@ enum bch_lru_type { }; #define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) +#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2) #define LRU_TIME_BITS 48 #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) From 68171d91cef2847a4aa3e532141510a591db9729 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Feb 2025 12:58:21 -0500 Subject: [PATCH 088/180] bcachefs: Kill dirent_occupied_size() in rename path Cc: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 12 ++++++++++-- fs/bcachefs/dirent.h | 4 ++-- fs/bcachefs/fs-common.c | 4 ++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 600eee936f13..17b767799cff 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -275,8 +275,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, - subvol_inum 
dst_dir, struct bch_hash_info *dst_hash, + subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) @@ -406,6 +406,14 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + if (old_dst.k) + *dst_dir_i_size -= bkey_bytes(old_dst.k); + *src_dir_i_size -= bkey_bytes(old_src.k); + + if (mode == BCH_RENAME_EXCHANGE) + *src_dir_i_size += bkey_bytes(&new_src->k); + *dst_dir_i_size += bkey_bytes(&new_dst->k); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 362b3b2f2f2e..ec0ff2ea6275 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -62,8 +62,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, - subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, u64 *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 2c3d46ac70c6..47e521e64229 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -417,8 +417,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, - dst_dir, &dst_hash, + src_dir, &src_hash, &src_dir_u->bi_size, + dst_dir, &dst_hash, &dst_dir_u->bi_size, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); From 72f4edcf452ce173b292e3c1817195e5f834b9de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Feb 2025 13:15:50 -0500 Subject: [PATCH 089/180] bcachefs: Kill 
dirent_occupied_size() in create path Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 3 +++ fs/bcachefs/dirent.h | 2 +- fs/bcachefs/fs-common.c | 22 +++++++++++----------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 17b767799cff..27737aaa03a6 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -233,6 +233,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, + u64 *i_size, enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; @@ -243,6 +244,8 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, if (ret) return ret; + *i_size += bkey_bytes(&dirent->k); + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index ec0ff2ea6275..37f01c1a3f7f 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -47,7 +47,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, + const struct qstr *, u64, u64 *, u64 *, enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 47e521e64229..1d454333afa2 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -153,16 +153,14 @@ int bch2_create_trans(struct btree_trans *trans, dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; - ret = bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - - ret = bch2_dirent_create(trans, dir, &dir_hash, - dir_type, - name, - dir_target, - &dir_offset, - STR_HASH_must_create|BTREE_ITER_with_updates); + ret = 
bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, + &dir_offset, + &dir_u->bi_size, + STR_HASH_must_create|BTREE_ITER_with_updates) ?: + bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; @@ -225,7 +223,9 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum.inum, &dir_offset, + name, inum.inum, + &dir_offset, + &dir_u->bi_size, STR_HASH_must_create); if (ret) goto err; From 76872d46b7fa8b6dc8303de4ed45a9ac6ed92f91 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Sun, 13 Aug 2023 17:49:12 +0100 Subject: [PATCH 090/180] bcachefs: Split out dirent alloc and name initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Splits out the code that allocates the dirent and initializes the name to make things easier to implement casefolding in a future commit. Cc: André Almeida Cc: Gabriel Krisman Bertazi Signed-off-by: Joshua Ashton Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 46 ++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 27737aaa03a6..7dcc18000726 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -163,15 +163,13 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - subvol_inum dir, u8 type, - const struct qstr *name, u64 dst) +static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, + subvol_inum dir, + u8 type, + int name_len, u64 dst) { struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len); BUG_ON(u64s > U8_MAX); @@ -191,11 
+189,35 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, dirent->v.d_type = type; - memcpy(dirent->v.d_name, name->name, name->len); - memset(dirent->v.d_name + name->len, 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); + return dirent; +} + +static void dirent_init_regular_name(struct bkey_i_dirent *dirent, + const struct qstr *name) +{ + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); +} + +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + subvol_inum dir, + u8 type, + const struct qstr *name, + u64 dst) +{ + struct bkey_i_dirent *dirent; + + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + dirent = dirent_alloc_key(trans, dir, type, name->len, dst); + if (IS_ERR(dirent)) + return dirent; + + dirent_init_regular_name(dirent, name); EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); From d37c14ac6f05ec98db9b3d9db424dc73a0f5b1cd Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Sun, 13 Aug 2023 18:34:17 +0100 Subject: [PATCH 091/180] bcachefs: bcachefs_metadata_version_casefolding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements support for case-insensitive file name lookups in bcachefs. The implementation uses the same UTF-8 lowering and normalization that ext4 and f2fs is using. More information is provided in Documentation/bcachefs/casefolding.rst Compatibility notes: This uses the new versioning scheme for incompatible features where an incompatible feature is tied to a version number: the superblock says "we may use incompat features up to x" and "incompat features up to x are in use", disallowing mounting by previous versions. 
Additionally, an old-style incompat feature bit is used, so that kernels without utf8 casefolding support know if casefolding specifically is in use and they're allowed to mount. Signed-off-by: Joshua Ashton Cc: André Almeida Cc: Gabriel Krisman Bertazi Signed-off-by: Kent Overstreet --- .../filesystems/bcachefs/casefolding.rst | 87 +++++++++ fs/bcachefs/bcachefs.h | 6 + fs/bcachefs/bcachefs_format.h | 6 +- fs/bcachefs/dirent.c | 170 +++++++++++++++--- fs/bcachefs/dirent.h | 9 +- fs/bcachefs/dirent_format.h | 20 ++- fs/bcachefs/fs-common.c | 4 + fs/bcachefs/fs-ioctl.c | 25 +++ fs/bcachefs/fs-ioctl.h | 20 ++- fs/bcachefs/fs.c | 17 ++ fs/bcachefs/inode_format.h | 3 +- fs/bcachefs/sb-errors_format.h | 4 +- fs/bcachefs/str_hash.c | 2 +- fs/bcachefs/str_hash.h | 4 + fs/bcachefs/super.c | 19 ++ 15 files changed, 354 insertions(+), 42 deletions(-) create mode 100644 Documentation/filesystems/bcachefs/casefolding.rst diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst new file mode 100644 index 000000000000..6546aa4f7a86 --- /dev/null +++ b/Documentation/filesystems/bcachefs/casefolding.rst @@ -0,0 +1,87 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Casefolding +=========== + +bcachefs has support for case-insensitive file and directory +lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`) +casefolding attributes. + +The main use case for casefolding is compatibility with software written +against other filesystems that rely on casefolded lookups +(e.g. NTFS and Wine/Proton). +Taking advantage of file-system level casefolding can lead to great +loading time gains in many applications and games. + +Casefolding support requires a kernel with `CONFIG_UNICODE` enabled. +Once a directory has been flagged for casefolding, a feature bit +is enabled on the superblock which marks the filesystem as using +casefolding.
+When the feature bit for casefolding is enabled, it is no longer possible +to mount that filesystem on kernels without `CONFIG_UNICODE` enabled. + +On the lookup/query side: casefolding is implemented by allocating a new +string of `BCH_NAME_MAX` length using the `utf8_casefold` function to +casefold the query string. + +On the dirent side: casefolding is implemented by ensuring the `bkey`'s +hash is made from the casefolded string and storing the cached casefolded +name with the regular name in the dirent. + +The structure looks like this: + +Regular: [dirent data][regular name][nul][nul]... +Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]... + +(Do note, the number of `NUL`s here is merely for illustration, their count can vary + per-key, and they may not even be present if the key is aligned to `sizeof(u64)`.) + +This is efficient as it means that for all file lookups that require casefolding, +it has identical performance to a regular lookup: +a hash comparison and a `memcmp` of the name. + +Rationale +--------- + +Several designs were considered for this system: +One was to introduce a dirent_v2, however that would be painful especially as +the hash system only has support for a single key type. This would also need +`BCH_NAME_MAX` to change between versions, and a new feature bit. + +Another option was to store without the two lengths, and just take the length of +the regular name and casefolded name contiguously / 2 as the length. This would +assume that the regular length == casefolded length, but that could potentially +not be true, if the uppercase unicode glyph had a different UTF-8 encoding than +the lowercase unicode glyph. +It would be possible to disregard the casefold cache for those cases, but it was +decided to simply encode the two string lengths in the key to avoid random +performance issues if this edge case was ever hit.
+ +The option settled on was to use a free-bit in d_type to mark a dirent as having +a casefold cache, and then treat the first 4 bytes of the name block as lengths. +You can see this in the `d_cf_name_block` member of the union in `bch_dirent`. + +The feature bit was used to allow casefolding support to be enabled for the majority +of users, but still allow users who have no need for the feature to use bcachefs, as +`CONFIG_UNICODE` can increase the kernel size by a significant amount due to the tables used, +which may be the deciding factor in using bcachefs on e.g. embedded platforms. + +Other filesystems like ext4 and f2fs have a super-block level option for casefolding +encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose +any encodings other than a single UTF-8 version. When future encodings are desirable, +they will be added trivially using the opts mechanism. + +dentry/dcache considerations +---------------------------- + +Currently, in casefolded directories, bcachefs (like other filesystems) will not cache +negative dentries. + +This is because currently doing so presents a problem in the following scenario: + - Lookup file "blAH" in a casefolded directory + - Creation of file "BLAH" in a casefolded directory + - Lookup file "blAH" in a casefolded directory +This would fail if negative dentries were cached. + +This is slightly suboptimal, but could be fixed in future with some vfs work.
+ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e8f4999806b6..d2c3f59a668f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -203,6 +203,7 @@ #include #include #include +#include #include "bcachefs_format.h" #include "btree_journal_iter_types.h" @@ -699,6 +700,8 @@ enum bch_write_ref { BCH_WRITE_REF_NR, }; +#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0) + struct bch_fs { struct closure cl; @@ -783,6 +786,9 @@ struct bch_fs { u64 btrees_lost_data; } sb; +#ifdef CONFIG_UNICODE + struct unicode_map *cf_encoding; +#endif struct bch_sb_handle disk_sb; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b4ac311f21a1..13cc0833b488 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -689,7 +689,8 @@ struct bch_sb_field_ext { x(directory_size, BCH_VERSION(1, 20)) \ x(cached_backpointers, BCH_VERSION(1, 21)) \ x(stripe_backpointers, BCH_VERSION(1, 22)) \ - x(stripe_lru, BCH_VERSION(1, 23)) + x(stripe_lru, BCH_VERSION(1, 23)) \ + x(casefolding, BCH_VERSION(1, 24)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -911,7 +912,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(journal_no_flush, 16) \ x(alloc_v2, 17) \ x(extents_across_btree_nodes, 18) \ - x(incompat_version_field, 19) + x(incompat_version_field, 19) \ + x(casefolding, 20) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 7dcc18000726..f4c283d1e86a 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,6 +13,40 @@ #include +static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + *out_cf = (struct qstr) QSTR_INIT(NULL, 0); + +#ifdef CONFIG_UNICODE + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); + int ret = PTR_ERR_OR_ZERO(buf); + if (ret) + 
return ret; + + ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); + if (ret <= 0) + return ret; + + *out_cf = (struct qstr) QSTR_INIT(buf, ret); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static inline int bch2_maybe_casefold(struct btree_trans *trans, + const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + if (likely(!info->cf_encoding)) { + *out_cf = *str; + return 0; + } else { + return bch2_casefold(trans, info, str, out_cf); + } +} + static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) @@ -28,13 +62,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) #endif return bkey_bytes - - offsetof(struct bch_dirent, d_name) - + (d.v->d_casefold + ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + : offsetof(struct bch_dirent, d_name)) - trailing_nuls; } struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) { - return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); + } else { + return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + } +} + +static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) +{ + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); + } else { + return (struct qstr) QSTR_INIT(NULL, 0); + } +} + +static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) +{ + return d.v->d_casefold + ? 
bch2_dirent_get_casefold_name(d) + : bch2_dirent_get_name(d); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -57,7 +116,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(d); + struct qstr name = bch2_dirent_get_lookup_name(d); return bch2_dirent_hash(info, &name); } @@ -65,7 +124,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); const struct qstr *r_name = _r; return !qstr_eq(l_name, *r_name); @@ -75,8 +134,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - const struct qstr l_name = bch2_dirent_get_name(l); - const struct qstr r_name = bch2_dirent_get_name(r); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); + const struct qstr r_name = bch2_dirent_get_lookup_name(r); return !qstr_eq(l_name, r_name); } @@ -104,17 +163,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned name_block_len = bch2_dirent_name_bytes(d); struct qstr d_name = bch2_dirent_get_name(d); + struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); int ret = 0; bkey_fsck_err_on(!d_name.len, c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, c, dirent_val_too_big, - "value too big (%zu > %u)", - bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + 
"dirent names exceed bkey size (%d + %d > %d)", + d_name.len, d_cf_name.len, name_block_len); /* * Check new keys don't exceed the max length @@ -142,6 +203,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, dirent_to_itself, "dirent points to own directory"); + + if (d.v->d_casefold) { + bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit && + d_cf_name.len > BCH_NAME_MAX, + c, dirent_cf_name_too_big, + "dirent w/ cf name too big (%u > %u)", + d_cf_name.len, BCH_NAME_MAX); + + bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), + c, dirent_stray_data_after_cf_name, + "dirent has stray data after cf name's NUL"); + } fsck_err: return ret; } @@ -166,10 +239,11 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, subvol_inum dir, u8 type, - int name_len, u64 dst) + int name_len, int cf_name_len, + u64 dst) { struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len); + unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); BUG_ON(u64s > U8_MAX); @@ -188,6 +262,8 @@ static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, } dirent->v.d_type = type; + dirent->v.d_unused = 0; + dirent->v.d_casefold = cf_name_len ? 
1 : 0; return dirent; } @@ -195,6 +271,8 @@ static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, static void dirent_init_regular_name(struct bkey_i_dirent *dirent, const struct qstr *name) { + EBUG_ON(dirent->v.d_casefold); + memcpy(&dirent->v.d_name[0], name->name, name->len); memset(&dirent->v.d_name[name->len], 0, bkey_val_bytes(&dirent->k) - @@ -202,10 +280,30 @@ static void dirent_init_regular_name(struct bkey_i_dirent *dirent, name->len); } +static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, + const struct qstr *name, + const struct qstr *cf_name) +{ + EBUG_ON(!dirent->v.d_casefold); + EBUG_ON(!cf_name->len); + + dirent->v.d_cf_name_block.d_name_len = name->len; + dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len; + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); + memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); + memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_cf_name_block.d_names) - + name->len + cf_name->len); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); +} + static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, subvol_inum dir, u8 type, const struct qstr *name, + const struct qstr *cf_name, u64 dst) { struct bkey_i_dirent *dirent; @@ -213,13 +311,16 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, if (name->len > BCH_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - dirent = dirent_alloc_key(trans, dir, type, name->len, dst); + dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? 
cf_name->len : 0, dst); if (IS_ERR(dirent)) return dirent; - dirent_init_regular_name(dirent, name); + if (cf_name) + dirent_init_casefolded_name(dirent, name, cf_name); + else + dirent_init_regular_name(dirent, name); - EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); + EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); return dirent; } @@ -235,7 +336,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -261,7 +362,16 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir, type, name, dst_inum); + if (hash_info->cf_encoding) { + struct qstr cf_name; + ret = bch2_casefold(trans, hash_info, name, &cf_name); + if (ret) + return ret; + dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum); + } else { + dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum); + } + ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -306,6 +416,7 @@ int bch2_dirent_rename(struct btree_trans *trans, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { + struct qstr src_name_lookup, dst_name_lookup; struct btree_iter src_iter = { NULL }; struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst = bkey_s_c_null; @@ -320,8 +431,11 @@ int bch2_dirent_rename(struct btree_trans *trans, memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ + ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); + if (ret) + goto out; old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, + src_hash, src_dir, &src_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_src); if 
(ret) @@ -333,6 +447,9 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; /* Lookup dst: */ + ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); + if (ret) + goto out; if (mode == BCH_RENAME) { /* * Note that we're _not_ checking if the target already exists - @@ -340,12 +457,12 @@ int bch2_dirent_rename(struct btree_trans *trans, * correctness: */ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name); + dst_hash, dst_dir, &dst_name_lookup); if (ret) goto out; } else { old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, + dst_hash, dst_dir, &dst_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) @@ -361,7 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); + new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -371,7 +489,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); + new_src = dirent_create_key(trans, src_dir, 0, src_name, + src_hash->cf_encoding ? 
&src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -498,9 +617,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, const struct qstr *name, subvol_inum *inum, unsigned flags) { + struct qstr lookup_name; + int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name); + if (ret) + return ret; + struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - int ret = bkey_err(k); + hash_info, dir, &lookup_name, flags); + ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 37f01c1a3f7f..a6e15a012936 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -25,10 +25,13 @@ struct bch_inode_info; struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); -static inline unsigned dirent_val_u64s(unsigned len) +static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { - return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, - sizeof(u64)); + unsigned bytes = cf_len + ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len + : offsetof(struct bch_dirent, d_name) + len; + + return DIV_ROUND_UP(bytes, sizeof(u64)); } int bch2_dirent_read_target(struct btree_trans *, subvol_inum, diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h index 5e116b88e814..a46dbddd21aa 100644 --- a/fs/bcachefs/dirent_format.h +++ b/fs/bcachefs/dirent_format.h @@ -29,9 +29,25 @@ struct bch_dirent { * Copy of mode bits 12-15 from the target inode - so userspace can get * the filetype without having to do a stat() */ - __u8 d_type; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 d_type:5, + d_unused:2, + d_casefold:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 d_casefold:1, + d_unused:2, + d_type:5; +#endif - __u8 d_name[]; + union { + struct { + __u8 d_pad; + __le16 d_name_len; + __le16 d_cf_name_len; + __u8 d_names[]; + } d_cf_name_block __packed; + __DECLARE_FLEX_ARRAY(__u8, d_name); + } __packed; } __packed __aligned(8); #define DT_SUBVOL 16 diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 1d454333afa2..fbc3da59536c 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -47,6 +47,10 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; + /* Inherit casefold state from parent. */ + if (S_ISDIR(mode)) + new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded; + if (!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 15725b4ce393..4465a2a821e3 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -54,6 +54,31 @@ static int bch2_inode_flags_set(struct btree_trans *trans, (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) return -EINVAL; + if ((newflags ^ oldflags) & BCH_INODE_casefolded) { +#ifdef CONFIG_UNICODE + int ret = 0; + /* Not supported on individual files. 
*/ + if (!S_ISDIR(bi->bi_mode)) + return -EOPNOTSUPP; + + /* + * Make sure the dir is empty, as otherwise we'd need to + * rehash everything and update the dirent keys. + */ + ret = bch2_empty_dir_trans(trans, inode_inum(inode)); + if (ret < 0) + return ret; + + if (!bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding)) + return -EOPNOTSUPP; + + bch2_check_set_feature(c, BCH_FEATURE_casefolding); +#else + printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); + return -EOPNOTSUPP; +#endif + } + if (s->set_projinherit) { bi->bi_fields_set &= ~(1 << Inode_opt_project); bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index d30f9bb056fd..ecd3bfdcde21 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -6,19 +6,21 @@ /* bcachefs inode flags -> vfs inode flags: */ static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, + [__BCH_INODE_casefolded] = S_CASEFOLD, }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, + [__BCH_INODE_casefolded] = FS_CASEFOLD_FL, }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5d910f1c671c..2c011a465588 100644 --- 
a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -698,6 +698,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, if (IS_ERR(inode)) inode = NULL; +#ifdef CONFIG_UNICODE + if (!inode && IS_CASEFOLDED(vdir)) { + /* + * Do not cache a negative dentry in casefolded directories + * as it would need to be invalidated in the following situation: + * - Lookup file "blAH" in a casefolded directory + * - Creation of file "BLAH" in a casefolded directory + * - Lookup file "blAH" in a casefolded directory + * which would fail if we had a negative dentry. + * + * We should come back to this when VFS has a method to handle + * this edgecase. + */ + return NULL; + } +#endif + return d_splice_alias(&inode->v, dentry); } diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index b99a5bf1a75e..117110af1e3f 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -137,7 +137,8 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) + x(has_child_snapshot, 9) \ + x(casefolded, 10) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index b86ec013d7d7..cdafd877b8a1 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -314,7 +314,9 @@ enum bch_fsck_flags { x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ - x(MAX, 304, 0) + x(dirent_cf_name_too_big, 304, 0) \ + x(dirent_stray_data_after_cf_name, 305, 0) \ + x(MAX, 306, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index d78451c2a0c6..93e71119e5a4 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, for (unsigned i = 0; i < 1000; i++) { unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); if (u64s > U8_MAX) return -EINVAL; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 55a4ac7bf220..f645a4547b04 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -34,6 +34,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) struct bch_hash_info { u8 type; + struct unicode_map *cf_encoding; /* * For crc32 or crc64 string hashes the first key value of * the siphash_key (k0) is used as the key. @@ -47,6 +48,9 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) /* XXX ick */ struct bch_hash_info info = { .type = INODE_STR_HASH(bi), +#ifdef CONFIG_UNICODE + .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL, +#endif .siphash_key = { .k0 = bi->bi_hash_seed } }; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0459c875e189..11877aea38ec 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -837,6 +837,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; +#ifdef CONFIG_UNICODE + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } +#else + if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { + printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); + ret = -EINVAL; + goto err; + } +#endif + pr_uuid(&name, c->sb.user_uuid.b); ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) From 24d790a7daa37093679874804842703cf13bf511 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Feb 2025 12:46:15 -0500 Subject: [PATCH 092/180] bcachefs: sysfs internal/trigger_btree_updates Add a debug knob to manually trigger the btree updates worker. Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b3f2c651c1f8..a9953181c29b 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -146,6 +146,7 @@ write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_btree_updates); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -411,6 +412,9 @@ STORE(bch2_fs) /* Debugging: */ + if (attr == &sysfs_trigger_btree_updates) + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; @@ -580,6 +584,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_btree_updates, &sysfs_gc_gens_pos, From 82b5666912e643ce806460130c1fd05fe6354193 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sat, 22 Feb 2025 16:18:50 +0700 Subject: [PATCH 093/180] Documentation: bcachefs: casefolding: Do not italicize NUL 
Sphinx reports htmldocs warning: Documentation/filesystems/bcachefs/casefolding.rst:36: WARNING: Inline interpreted text or phrase reference start-string without end-string. [docutils] That's because NUL word is italicized but it is written in plural form instead (`NUL`s). Sphinx, however, doesn't tip over when the italicized word in this fashion is followed by punctuation instead. Do not italicize the word to keep Sphinx happy. Fixes: bc5cc09246c5 ("bcachefs: bcachefs_metadata_version_casefolding") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250221162135.79be0147@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/casefolding.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst index 6546aa4f7a86..1c385b6d21a0 100644 --- a/Documentation/filesystems/bcachefs/casefolding.rst +++ b/Documentation/filesystems/bcachefs/casefolding.rst @@ -33,8 +33,9 @@ The structure looks like this: Regular: [dirent data][regular name][nul][nul]... Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]... -(Do note, the number of `NUL`s here is merely for illustration, they count can vary - per-key, and they may not even be present if the key is aligned to `sizeof(u64)`.) +(Do note, the number of NULs here is merely for illustration; their count can +vary per-key, and they may not even be present if the key is aligned to +`sizeof(u64)`.) 
This is efficient as it means that for all file lookups that require casefolding, it has identical performance to a regular lookup: From 210997859a3ce6e50adb6156de94be7084dcb92e Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sat, 22 Feb 2025 16:18:51 +0700 Subject: [PATCH 094/180] Documentation: bcachefs: casefolding: Fix dentry/dcache considerations section Sphinx reports htmldocs warnings on dentry/dcache section: Documentation/filesystems/bcachefs/casefolding.rst:75: WARNING: Title underline too short. dentry/dcache considerations --------- [docutils] Documentation/filesystems/bcachefs/casefolding.rst:84: WARNING: Definition list ends without a blank line; unexpected unindent. [docutils] Fix the section by: * Extending the section underline to match the section title length; * Separating problem list from surrounding paragraphs. Fixes: bc5cc09246c5 ("bcachefs: bcachefs_metadata_version_casefolding") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250221161911.2d16138b@canb.auug.org.au/ Closes: https://lore.kernel.org/linux-next/20250221162135.79be0147@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/casefolding.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst index 1c385b6d21a0..d5861b444635 100644 --- a/Documentation/filesystems/bcachefs/casefolding.rst +++ b/Documentation/filesystems/bcachefs/casefolding.rst @@ -73,15 +73,17 @@ any encodings than a single UTF-8 version. When future encodings are desirable, they will be added trivially using the opts mechanism. dentry/dcache considerations ---------- +---------------------------- Currently, in casefolded directories, bcachefs (like other filesystems) will not cache negative dentry's. 
This is because currently doing so presents a problem in the following scenario: + - Lookup file "blAH" in a casefolded directory - Creation of file "BLAH" in a casefolded directory - Lookup file "blAH" in a casefolded directory + This would fail if negative dentry's were cached. This is slightly suboptimal, but could be fixed in future with some vfs work. From 47d4100b15c19f6a0b4594ca58eb5aab90703e96 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sat, 22 Feb 2025 16:18:52 +0700 Subject: [PATCH 095/180] Documentation: bcachefs: casefolding: Use bullet list for dirent structure The doc lists dirent structure for both regular and casefolded names, yet it is written (and rendered) as long paragraph instead. Write the structure list as bullet list. Fixes: bc5cc09246c5 ("bcachefs: bcachefs_metadata_version_casefolding") Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/casefolding.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst index d5861b444635..ba5de97d155f 100644 --- a/Documentation/filesystems/bcachefs/casefolding.rst +++ b/Documentation/filesystems/bcachefs/casefolding.rst @@ -30,8 +30,8 @@ name with the regular name in the dirent. The structure looks like this: -Regular: [dirent data][regular name][nul][nul]... -Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]... +* Regular: [dirent data][regular name][nul][nul]... +* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]... 
(Do note, the number of NULs here is merely for illustration; their count can vary per-key, and they may not even be present if the key is aligned to From 7442ef708254973996db93e0a5f7d390941a24c5 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sat, 22 Feb 2025 16:18:53 +0700 Subject: [PATCH 096/180] Documentation: bcachefs: Add casefolding toctree entry Sphinx reports htmldocs toctree warning: Documentation/filesystems/bcachefs/casefolding.rst: WARNING: document isn't included in any toctree Fix the warning by adding casefolding documentation entry to bcachefs toctree. Fixes: bc5cc09246c5 ("bcachefs: bcachefs_metadata_version_casefolding") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250221161728.32739f85@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst index 7db4d7ceab58..0415b5d78192 100644 --- a/Documentation/filesystems/bcachefs/index.rst +++ b/Documentation/filesystems/bcachefs/index.rst @@ -10,4 +10,5 @@ bcachefs Documentation CodingStyle SubmittingPatches + casefolding errorcodes From 93422e0b33edf1cb48e1a94edda6940038fac378 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 24 Feb 2025 19:40:26 +0700 Subject: [PATCH 097/180] Documentation: bcachefs: Split index toctree bcachefs subsystem currently has 4 docs: two are development notes and the rest are actual filesystem docs. These two groups are clearly distinct and can be organized. Split the toctree into two, one for each docs group. While at it, also reduce :maxdepth: so that only title headings are listed in the toctrees. 
Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/index.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst index 0415b5d78192..3864d0ae89c1 100644 --- a/Documentation/filesystems/bcachefs/index.rst +++ b/Documentation/filesystems/bcachefs/index.rst @@ -4,11 +4,28 @@ bcachefs Documentation ====================== +Subsystem-specific development process notes +-------------------------------------------- + +Development notes specific to bcachefs. These are intended to supplement +:doc:`general kernel development handbook `. + .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :numbered: CodingStyle SubmittingPatches + +Filesystem implementation +------------------------- + +Documentation for filesystem features and their implementation details. +At this moment, only a few of these are described here. + +.. toctree:: + :maxdepth: 1 + :numbered: + casefolding errorcodes From 76d6305dca7e8fc3947d5b6ce31dc6e307de605f Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 24 Feb 2025 19:40:27 +0700 Subject: [PATCH 098/180] Documentation: bcachefs: SubmittingPatches: Demote section headings SubmttingPatches.rst has 4 section headings, all under the same heading levels. In absence of title headings, these section headings are all ended up as title headings in the docs output, which also affect the index toctree (increasing titles to 6 from the original 2) due to :numbered: option. Demote second-to-last section headings, making "Submitting patches to bcachefs" as title heading. 
Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- .../bcachefs/SubmittingPatches.rst | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst index 026b12ae0d6a..ece0e85d2598 100644 --- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst +++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst @@ -1,5 +1,10 @@ -Submitting patches to bcachefs: -=============================== +Submitting patches to bcachefs +============================== + +Here are suggestions for submitting patches to bcachefs subsystem. + +Submission checklist +-------------------- Patches must be tested before being submitted, either with the xfstests suite [0], or the full bcachefs test suite in ktest [1], depending on what's being @@ -26,8 +31,8 @@ considered out of date), but try not to deviate too much without reason. Focus on writing code that reads well and is organized well; code should be aesthetically pleasing. -CI: -=== +CI +-- Instead of running your tests locally, when running the full test suite it's prefereable to let a server farm do it in parallel, and then have the results @@ -39,8 +44,8 @@ a big tech company, you'll need to help out with server costs to get access - but the CI is not restricted to running bcachefs tests: it runs any ktest test (which generally makes it easy to wrap other tests that can run in qemu). -Other things to think about: -============================ +Other things to think about +--------------------------- - How will we debug this code? Is there sufficient introspection to diagnose when something starts acting wonky on a user machine? @@ -79,8 +84,8 @@ Other things to think about: tested? (Automated tests exists but aren't in the CI, due to the hassle of disk image management; coordinate to have them run.) 
-Mailing list, IRC: -================== +Mailing list, IRC +----------------- Patches should hit the list [3], but much discussion and code review happens on IRC as well [4]; many people appreciate the more conversational approach and From a42d685ff26362855c068345b803c6f514ac4c0f Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 24 Feb 2025 19:40:28 +0700 Subject: [PATCH 099/180] Documentation: bcachefs: SubmittingPatches: Convert footnotes to reST syntax Footnotes list are outputted in htmldocs simply as long-running paragraph instead. Use reST numbered footnotes syntax for the job. Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- .../bcachefs/SubmittingPatches.rst | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst index ece0e85d2598..a455f9cfd15c 100644 --- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst +++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst @@ -7,7 +7,7 @@ Submission checklist -------------------- Patches must be tested before being submitted, either with the xfstests suite -[0], or the full bcachefs test suite in ktest [1], depending on what's being +[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being touched. Note that ktest wraps xfstests and will be an easier method to running it for most users; it includes single-command wrappers for all the mainstream in-kernel local filesystems. @@ -39,7 +39,7 @@ prefereable to let a server farm do it in parallel, and then have the results in a nice test dashboard (which can tell you which failures are new, and presents results in a git log view, avoiding the need for most bisecting). -That exists [2], and community members may request an account. If you work for +That exists [2]_, and community members may request an account. 
If you work for a big tech company, you'll need to help out with server costs to get access - but the CI is not restricted to running bcachefs tests: it runs any ktest test (which generally makes it easy to wrap other tests that can run in qemu). @@ -87,17 +87,19 @@ Other things to think about Mailing list, IRC ----------------- -Patches should hit the list [3], but much discussion and code review happens on -IRC as well [4]; many people appreciate the more conversational approach and -quicker feedback. +Patches should hit the list [3]_, but much discussion and code review happens +on IRC as well [4]_; many people appreciate the more conversational approach +and quicker feedback. Additionally, we have a lively user community doing excellent QA work, which exists primarily on IRC. Please make use of that resource; user feedback is important for any nontrivial feature, and documenting it in commit messages would be a good idea. -[0]: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git -[1]: https://evilpiepirate.org/git/ktest.git/ -[2]: https://evilpiepirate.org/~testdashboard/ci/ -[3]: linux-bcachefs@vger.kernel.org -[4]: irc.oftc.net#bcache, #bcachefs-dev +.. rubric:: References + +.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git +.. [1] https://evilpiepirate.org/git/ktest.git/ +.. [2] https://evilpiepirate.org/~testdashboard/ci/ +.. [3] linux-bcachefs@vger.kernel.org +.. [4] irc.oftc.net#bcache, #bcachefs-dev From fb195fa7538fe1150a4c6033ac8a3ed90f1ba69e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 14:20:58 -0400 Subject: [PATCH 100/180] bcachefs: BCH_SB_FEATURES_ALL includes BCH_FEATURE_incompat_version_field These features are set on format and incompat upgrade.
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/super-io.c | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 13cc0833b488..8114ad9a3fe6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -927,7 +927,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u BIT_ULL(BCH_FEATURE_new_siphash)| \ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ BIT_ULL(BCH_FEATURE_new_varint)| \ - BIT_ULL(BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_journal_no_flush)| \ + BIT_ULL(BCH_FEATURE_incompat_version_field)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 5bd7bb90ee48..7e726b3dc6f4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1214,12 +1214,11 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) bch2_sb_field_resize(&c->disk_sb, downgrade, 0); c->disk_sb.sb->version = cpu_to_le16(new_version); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); if (incompat) { + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); } } From bafd41b435afebe40ce931cc4599e5aa330788f3 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 10 Mar 2025 20:20:29 +0100 Subject: [PATCH 101/180] bcachefs: Fix error type in bch2_alloc_v3_validate() Use error type alloc_v3_unpack_error in bch2_alloc_v3_validate(). 
Fixes: b65db750e2bb ("bcachefs: Enumerate fsck errors") Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ecad4a78c3f7..4dfcf3e6fffd 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), - c, alloc_v2_unpack_error, + c, alloc_v3_unpack_error, "unpack error"); fsck_err: return ret; From 6422bf8117cc2a8922b908a2634c01f4a2cd1818 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Feb 2025 18:59:58 -0500 Subject: [PATCH 102/180] bcachefs: bch2_request_incompat_feature() now returns error code For future usage, we'll want a dedicated error code for better debugging. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/fs-ioctl.c | 5 +++-- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/super-io.c | 10 ++++++---- fs/bcachefs/super-io.h | 8 ++++---- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 20bfdee42309..9e19bc37aa72 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -206,6 +206,7 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 4465a2a821e3..17c035f9d629 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -69,8 +69,9 @@ static int bch2_inode_flags_set(struct btree_trans *trans, if (ret < 0) return ret; - if (!bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding)) - return -EOPNOTSUPP; + ret = 
bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding); + if (ret) + return ret; bch2_check_set_feature(c, BCH_FEATURE_casefolding); #else diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 50118661e64b..68172c6eba21 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -606,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c, u64 dst_done = 0; u32 dst_snapshot, src_snapshot; bool reflink_p_may_update_opts_field = - bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); + !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7e726b3dc6f4..7bd2d3d84295 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -69,12 +69,14 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta return v; } -bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) { - bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && - version <= c->sb.version_incompat_allowed; + int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && + version <= c->sb.version_incompat_allowed) + ? 
0 + : -BCH_ERR_may_not_use_incompat_feature; - if (ret) { + if (!ret) { mutex_lock(&c->sb_lock); SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index b4cff9ebdebb..167dd98f893e 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -21,13 +21,13 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); -bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); +int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); -static inline bool bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) +static inline int bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) { return likely(version <= c->sb.version_incompat) - ? true + ? 0 : bch2_set_version_incompat(c, version); } From 4a90675cfe18acbbfe1c3a9c2ef682f5976478ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Feb 2025 20:29:58 -0500 Subject: [PATCH 103/180] bcachefs: bcachefs_metadata_version_extent_flags This implements a new extent field bitflags that apply to the whole extent. There's been a couple things we've wanted this for in the past, but the immediate need is extent poisoning, to solve a rebalance issue. Unknown extent fields can't be parsed (we won't know their size, so we can't advance to the next field), so this is an incompat feature, and using it prevents the filesystem from being mounted by old versions. This also adds the BCH_EXTENT_FLAG_poisoned flag; this indicates that the data is known to be bad (i.e. there was a checksum error, and we had to write a new checksum) and reads will return errors.
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/extents.c | 43 +++++++++++++++++++++++++++++++++- fs/bcachefs/extents.h | 15 ++++++++++++ fs/bcachefs/extents_format.h | 24 +++++++++++++++++-- fs/bcachefs/sb-errors_format.h | 3 ++- 6 files changed, 84 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8114ad9a3fe6..a6cc817ccd87 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -690,7 +690,8 @@ struct bch_sb_field_ext { x(cached_backpointers, BCH_VERSION(1, 21)) \ x(stripe_backpointers, BCH_VERSION(1, 22)) \ x(stripe_lru, BCH_VERSION(1, 23)) \ - x(casefolding, BCH_VERSION(1, 24)) + x(casefolding, BCH_VERSION(1, 24)) \ + x(extent_flags, BCH_VERSION(1, 25)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 9e19bc37aa72..0d9a8198e95e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -271,6 +271,7 @@ x(EIO, mark_stripe) \ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ + x(EIO, extent_poisened) \ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ec653109de5b..d9bdf433c118 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -28,6 +28,13 @@ #include "trace.h" #include "util.h" +static const char * const bch2_extent_flags_strs[] = { +#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, + BCH_EXTENT_FLAGS() +#undef x + NULL, +}; + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -127,6 +134,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 
-BCH_ERR_extent_poisened; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { /* @@ -1225,6 +1235,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; + case BCH_EXTENT_ENTRY_flags: + prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); + break; + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1386,6 +1400,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, #endif break; } + case BCH_EXTENT_ENTRY_flags: + bkey_fsck_err_on(entry != ptrs.start, + c, extent_flags_not_at_start, + "extent flags entry not at start"); + break; } } @@ -1452,6 +1471,28 @@ void bch2_ptr_swab(struct bkey_s k) } } +int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) +{ + int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); + if (ret) + return ret; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { + ptrs.start->flags.flags = flags; + } else { + struct bch_extent_flags f = { + .type = BIT(BCH_EXTENT_ENTRY_flags), + .flags = flags, + }; + __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); + } + + return 0; +} + /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -1497,8 +1538,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: - break; case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_flags: break; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ed160aaa9546..c50c4f353bab 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -753,4 +753,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) k->size = new_size; } +static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c 
ptrs) +{ + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) + return ptrs.start->flags.flags; + return 0; +} + +static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) +{ + return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); +} + +int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); + #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index c198dfc376d6..74c0252cbd98 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -79,8 +79,9 @@ x(crc64, 2) \ x(crc128, 3) \ x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 + x(rebalance, 5) \ + x(flags, 6) +#define BCH_EXTENT_ENTRY_MAX 7 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr { #endif }; +#define BCH_EXTENT_FLAGS() \ + x(poisoned, 0) + +enum bch_extent_flags_e { +#define x(n, v) BCH_EXTENT_FLAG_##n = v, + BCH_EXTENT_FLAGS() +#undef x +}; + +struct bch_extent_flags { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + flags:57; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 flags:57, + type:7; +#endif +}; + /* bch_extent_rebalance: */ #include "rebalance_format.h" diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index cdafd877b8a1..67455beb8358 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -179,6 +179,7 @@ enum bch_fsck_flags { x(ptr_crc_redundant, 160, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ + x(extent_flags_not_at_start, 306, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ @@ -316,7 +317,7 @@ enum bch_fsck_flags { x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ x(dirent_cf_name_too_big, 304, 0) \ x(dirent_stray_data_after_cf_name, 305, 0) \ - x(MAX, 306, 0) + x(MAX, 307, 0) enum 
bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From fba513a9ee2fb8d4acc875a7de6d93f283cfc103 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Mar 2025 15:46:59 -0500 Subject: [PATCH 104/180] bcachefs: give bch2_write_super() a proper error code Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7bd2d3d84295..9a204baa3ab9 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1159,7 +1159,7 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -1; + ret = -BCH_ERR_erofs_sb_err; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); From 7bc580816869e31c121eefe26e7eaccd4e3b778b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Feb 2025 11:37:36 -0500 Subject: [PATCH 105/180] bcachefs: data_update now checks for extents that can't be moved If a device is ro or failed, we might not have anywhere to move a replica. Check for this early, before doing the read and attempting to write. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 27 ++++++++++++++++++++++++++- fs/bcachefs/errcode.h | 1 + 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 7e484afea551..522574bc4197 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -573,7 +573,6 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); - prt_newline(out); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) @@ -707,6 +706,18 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, return 0; } +static bool can_write_extent(struct bch_fs *c, + struct bch_devs_list *devs_have, + unsigned target) +{ + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); + + darray_for_each(*devs_have, i) + __clear_bit(*i, devs.d); + + return !bch2_is_zero(&devs, sizeof(devs)); +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, @@ -788,6 +799,20 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } + if (!can_write_extent(c, &m->op.devs_have, + m->op.flags & BCH_WRITE_only_specified_devs ? 
m->op.target : 0)) { + /* + * Check if we have rw devices not in devs_have: this can happen + * if we're trying to move data on a ro or failed device + * + * If we can't move it, we need to clear the rebalance_work bit, + * if applicable + * + * Also, copygc should skip ro/failed devices: + */ + return -BCH_ERR_data_update_done_no_rw_devs; + } + unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 0d9a8198e95e..ed4214e9beba 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -186,6 +186,7 @@ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ + x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ From 3480aecd5f4d65fffd775929d1de7349fa6b95c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Feb 2025 13:59:15 -0500 Subject: [PATCH 106/180] bcachefs: Fix read path io_ref handling We were using our device pointer after we'd released our ref to it. Unlikely to be a race that's practical to hit, since actually removing a member device is a whole process besides just taking it offline, but - needs to be fixed. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index dcd5a2aee0f1..f97716e52556 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -375,6 +375,11 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); + if (rbio->have_ioref) { + struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); + percpu_ref_put(&ca->io_ref); + } + if (rbio->split) { struct bch_read_bio *parent = rbio->parent; @@ -790,10 +795,8 @@ static void bch2_read_endio(struct bio *bio) struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - if (rbio->have_ioref) { + if (ca) bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; From 3526bca36b31731eb468cddaa3ad0f6ebc5d7520 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Feb 2025 14:07:22 -0500 Subject: [PATCH 107/180] bcachefs: bch2_account_io_completion() We need to start accounting successes for every IO, not just failures, so introduce a unified hook for io completion accounting and convert io_read.c. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/error.h | 20 +++++++++++++++++++ fs/bcachefs/io_read.c | 45 +++++++++++++++++++++--------------------- fs/bcachefs/io_write.h | 6 ------ 3 files changed, 43 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index b3cc69f29fd9..e055b606fb6f 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -216,6 +216,26 @@ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +static inline void bch2_account_io_completion(struct bch_dev *ca, + enum bch_member_error_type type, + u64 submit_time, bool success) +{ + if (unlikely(!ca)) + return; + + if (type != BCH_MEMBER_ERROR_checksum) + bch2_latency_acct(ca, submit_time, type); + + if (!success) + bch2_io_error(ca, type); +} + #define bch2_dev_io_err_on(cond, ca, _type, ...) \ ({ \ bool _ret = (cond); \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index f97716e52556..70e5c5a32d01 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -524,12 +524,10 @@ static void bch2_read_io_err(struct work_struct *work) bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_read); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", buf.buf); - } printbuf_exit(&buf); bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); @@ -614,12 +612,10 @@ static void bch2_read_csum_err(struct work_struct *work) bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", buf.buf); - } bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); printbuf_exit(&buf); @@ -671,6 +667,7 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct bio *src = &rbio->bio; struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; @@ -692,7 +689,22 @@ static void __bch2_read_endio(struct work_struct *work) } csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; + + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { + rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) goto csum_err; /* @@ -765,17 +777,6 @@ static void __bch2_read_endio(struct work_struct *work) memalloc_nofs_restore(nofs_flags); return; csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { - rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - bch2_rbio_punt(rbio, 
bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: @@ -795,8 +796,8 @@ static void bch2_read_endio(struct bio *bio) struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - if (ca) - bch2_latency_acct(ca, rbio->submit_time, READ); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rbio->submit_time, !bio->bi_status); if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index bf942566a8eb..627730537752 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -11,12 +11,6 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); From b31c070407edda710ad087d143353ddd0f2c9499 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Feb 2025 14:38:47 -0500 Subject: [PATCH 108/180] bcachefs: Finish bch2_account_io_completion() conversions More prep work for automatically kicking devices out after too many IO errors. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 38 +++++++++++++++++------------ fs/bcachefs/btree_node_scan.c | 12 ++++++--- fs/bcachefs/ec.c | 15 +++++++----- fs/bcachefs/errcode.h | 1 + fs/bcachefs/error.h | 33 +++++++------------------ fs/bcachefs/io_write.c | 12 +++++---- fs/bcachefs/journal_io.c | 46 +++++++++++++++++++++++------------ fs/bcachefs/journal_types.h | 1 + fs/bcachefs/super-io.c | 12 ++++----- 9 files changed, 95 insertions(+), 75 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 18413b4f22a3..cd792fee7ee3 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1329,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work) bch_info(c, "retrying read"); ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); rb->have_ioref = ca != NULL; + rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1339,12 +1340,17 @@ static void btree_node_read_work(struct work_struct *work) } else { bio->bi_status = BLK_STS_REMOVED; } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); + + if (ca && bio->bi_status) + bch_err_dev_ratelimited(ca, + "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1401,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio) struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); struct bch_fs *c = rb->c; + struct bch_dev *ca = rb->have_ioref + ? 
bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); queue_work(c->btree_read_complete_wq, &rb->work); } @@ -2126,16 +2131,17 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; - unsigned long flags; - if (wbio->have_ioref) - bch2_latency_acct(ca, wbio->submit_time, WRITE); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); - if (!ca || - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("btree")) { + if (ca && bio->bi_status) + bch_err_dev_ratelimited(ca, + "btree write error: %s", + bch2_blk_status_to_str(bio->bi_status)); + + if (bio->bi_status) { + unsigned long flags; spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a7f06deee13c..fb73ec77c099 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, bn, PAGE_SIZE); + u64 submit_time = local_clock(); submit_bio_wait(bio); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status))) + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + if (bio->bi_status) { + 
bch_err_dev_ratelimited(ca, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status)); return; + } if (le64_to_cpu(bn->magic) != bset_magic(c)) return; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1090cdb7d5cc..8c7a9addafae 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -105,6 +105,7 @@ struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; + u64 submit_time; struct bio bio; }; @@ -748,14 +749,15 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "erasure coding %s error: %s", + bch2_account_io_completion(ca, bio_data_dir(bio), + ec_bio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); clear_bit(ec_bio->idx, ec_bio->buf->valid); + } int stale = dev_ptr_stale(ca, ptr); if (stale) { @@ -818,6 +820,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; + ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ec_bio->bio.bi_end_io = ec_block_endio; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ed4214e9beba..d45ef03abc91 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -279,6 +279,7 @@ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ x(EIO, device_offline) \ + x(EIO, EIO_fault_injected) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 
e055b606fb6f..a57b9f18d060 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -222,6 +222,14 @@ void bch2_latency_acct(struct bch_dev *, u64, int); static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} #endif +static inline void bch2_account_io_success_fail(struct bch_dev *ca, + enum bch_member_error_type type, + bool success) +{ + if (!success) + bch2_io_error(ca, type); +} + static inline void bch2_account_io_completion(struct bch_dev *ca, enum bch_member_error_type type, u64 submit_time, bool success) @@ -232,32 +240,9 @@ static inline void bch2_account_io_completion(struct bch_dev *ca, if (type != BCH_MEMBER_ERROR_checksum) bch2_latency_acct(ca, submit_time, type); - if (!success) - bch2_io_error(ca, type); + bch2_account_io_success_fail(ca, type, success); } -#define bch2_dev_io_err_on(cond, ca, _type, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_dev_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) - -#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) - int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 738bdbfbdb14..dbfcb28f003d 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -716,11 +716,15 @@ static void bch2_write_endio(struct bio *bio) ? 
bch2_dev_have_ref(c, wbio->dev) : NULL; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_inum_offset_ratelimited(ca, op->pos.inode, wbio->inode_offset << 9, "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); op->flags |= BCH_WRITE_io_error; } @@ -732,10 +736,8 @@ static void bch2_write_endio(struct bio *bio) set_bit(wbio->dev, op->devs_need_flush->d); } - if (wbio->have_ioref) { - bch2_latency_acct(ca, wbio->submit_time, WRITE); + if (wbio->have_ioref) percpu_ref_put(&ca->io_ref); - } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7d59ccc07315..c12d9f9bd536 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1041,13 +1041,19 @@ static int journal_read_bucket(struct bch_dev *ca, bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); + u64 submit_time = local_clock(); ret = submit_bio_wait(bio); kfree(bio); - if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, - "journal read error: sector %llu", - offset) || - bch2_meta_read_fault("journal")) { + if (!ret && bch2_meta_read_fault("journal")) + ret = -BCH_ERR_EIO_fault_injected; + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + submit_time, !ret); + + if (ret) { + bch_err_dev_ratelimited(ca, + "journal read error: sector %llu", offset); /* * We don't error out of the recovery process * here, since the relevant journal entry may be @@ -1110,13 +1116,16 @@ static int journal_read_bucket(struct bch_dev *ca, struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); - if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, 
j->csum, csum), - err.buf))) + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) { + bch_err_dev_ratelimited(ca, "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf)); saw_bad = true; + } ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, @@ -1727,13 +1736,16 @@ static void journal_write_endio(struct bio *bio) struct journal *j = &ca->fs->journal; struct journal_buf *w = j->buf + jbio->buf_idx; - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + jbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("journal")) { - unsigned long flags; + bch2_blk_status_to_str(bio->bi_status)); + unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); @@ -1762,7 +1774,11 @@ static CLOSURE_CALLBACK(journal_write_submit) sectors); struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; + struct journal_bio *jbio = ja->bio[w->idx]; + struct bio *bio = &jbio->bio; + + jbio->submit_time = local_clock(); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a0b17c6ed83e..fd82f5d80355 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -175,6 +175,7 @@ typedef DARRAY(u64) darray_u64; struct journal_bio { struct bch_dev *ca; unsigned buf_idx; + u64 submit_time; struct bio bio; }; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 9a204baa3ab9..2fef285cfc1a 100644 --- 
a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -911,16 +911,16 @@ static void write_super_endio(struct bio *bio) { struct bch_dev *ca = bio->bi_private; + bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); + /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "superblock %s error: %s", + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "superblock %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); ca->sb_write_error = 1; + } closure_put(&ca->fs->sb_write); percpu_ref_put(&ca->io_ref); From 13fd6be102f75de25099757718c1de46fa57ae7a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Feb 2025 18:58:46 -0500 Subject: [PATCH 109/180] bcachefs: Stash a pointer to the filesystem for blk_holder_ops Note that we open block devices before we allocate bch_fs, but once attached to a filesystem they will be closed before the bch_fs is torn down - so stashing a pointer without a refcount looks incorrect but it's not. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/super.c | 7 +++++++ fs/bcachefs/super_types.h | 8 +++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2fef285cfc1a..74f1e45980db 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -748,7 +748,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, memset(sb, 0, sizeof(*sb)); sb->mode = BLK_OPEN_READ; sb->have_bio = true; - sb->holder = kmalloc(1, GFP_KERNEL); + sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); if (!sb->holder) return -ENOMEM; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 11877aea38ec..4d656139561c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1431,6 +1431,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); + /* + * Stash pointer to the filesystem for blk_holder_ops - note that once + * attached to a filesystem, we will always close the block device + * before tearing down the filesystem object. 
+ */ + ca->disk_sb.holder->c = ca->fs; + ca->dev = ca->disk_sb.bdev->bd_dev; percpu_ref_reinit(&ca->io_ref); diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 368a63d938cf..3a899f799d1d 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -2,13 +2,19 @@ #ifndef _BCACHEFS_SUPER_TYPES_H #define _BCACHEFS_SUPER_TYPES_H +struct bch_fs; + +struct bch_sb_handle_holder { + struct bch_fs *c; +}; + struct bch_sb_handle { struct bch_sb *sb; struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; - void *holder; + struct bch_sb_handle_holder *holder; size_t buffer_size; blk_mode_t mode; unsigned have_layout:1; From 1fdbe0b184c822191e9385fac8d8695a9e583ec7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Feb 2025 22:14:06 -0500 Subject: [PATCH 110/180] bcachefs: Make sure c->vfs_sb is set before starting fs This is necessary for the new blk_holder_ops, which want the vfs super_block available for synchronization. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2c011a465588..459ca8259fc0 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2218,9 +2218,10 @@ static int bch2_fs_get_tree(struct fs_context *fc) bch2_opts_apply(&c->opts, opts); - ret = bch2_fs_start(c); - if (ret) - goto err_stop_fs; + /* + * need to initialise sb and set c->vfs_sb _before_ starting fs, + * for blk_holder_ops + */ sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); @@ -2282,6 +2283,10 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_shrink->seeks = 0; + ret = bch2_fs_start(c); + if (ret) + goto err_put_super; + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); bch_err_msg(c, ret, "mounting: error getting root inode"); From d5308203a85e016e9ceb3e38742a1634c77a7706 Mon Sep 17 00:00:00 2001 From: 
Kent Overstreet Date: Tue, 25 Feb 2025 18:50:38 -0500 Subject: [PATCH 111/180] bcachefs: Implement blk_holder_ops We can't use the standard fs_holder_ops because they're meant for single device filesystems - fs_bdev_mark_dead() in particular - and they assume that the blk_holder is the super_block, which also doesn't work for a multi device filesystem. These generally follow the standard fs_holder_ops; the locking/refcounting is a bit simplified because c->ro_ref suffices, and bch2_fs_bdev_mark_dead() is not necessarily shutting down the entire filesystem. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 -- fs/bcachefs/super.c | 97 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/super.h | 2 + 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 74f1e45980db..918e4e7704dd 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -25,9 +25,6 @@ #include #include -static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { -}; - struct bch2_metadata_version { u16 version; const char *name; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4d656139561c..78a8daa80fcc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1075,6 +1075,7 @@ int bch2_fs_start(struct bch_fs *c) } set_bit(BCH_FS_started, &c->flags); + wake_up(&c->ro_ref_wait); if (c->opts.read_only) { bch2_fs_read_only(c); @@ -2024,6 +2025,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } +/* blk_holder_ops: */ + +static struct bch_fs *bdev_get_fs(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) +{ + struct bch_sb_handle_holder *holder = bdev->bd_holder; + struct bch_fs *c = holder->c; + + if (c && !bch2_ro_ref_tryget(c)) + c = NULL; + + mutex_unlock(&bdev->bd_holder_lock); + + if (c) + wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); + return c; +} + +/* returns with ref on ca->ref */ 
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) +{ + for_each_member_device(c, ca) + if (ca->disk_sb.bdev == bdev) + return ca; + return NULL; +} + +static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + } + + down_write(&c->state_lock); + struct bch_dev *ca = bdev_to_bch_dev(c, bdev); + if (!ca) + goto unlock; + + if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { + __bch2_dev_offline(c, ca); + } else { + if (sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + + bch2_journal_flush(&c->journal); + bch2_fs_emergency_read_only(c); + } + + bch2_dev_put(ca); +unlock: + if (sb) + up_read(&sb->s_umount); + up_write(&c->state_lock); + bch2_ro_ref_put(c); +} + +static void bch2_fs_bdev_sync(struct block_device *bdev) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + bch2_ro_ref_put(c); +} + +const struct blk_holder_ops bch2_sb_handle_bdev_ops = { + .mark_dead = bch2_fs_bdev_mark_dead, + .sync = bch2_fs_bdev_sync, +}; + /* Filesystem open: */ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 04f8287eff5c..23533bce5709 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs 
*bch2_fs_open(char * const *, unsigned, struct bch_opts); +extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; + #endif /* _BCACHEFS_SUPER_H */ From 2efa8397cac3bb18a129054d22ae58b60fbbdd26 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Mar 2025 16:14:28 -0500 Subject: [PATCH 112/180] bcachefs: Fix btree_node_scan io_ref handling This was completely fubar; it's now simplified a bit as well. Note that for_each_online_member() takes and releases io_refs as it iterates, so we need to release that if we break. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index fb73ec77c099..678161321e42 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -270,7 +270,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_get(&ca->io_ref); + percpu_ref_put(&ca->io_ref); closure_put(w->cl); kfree(w); return 0; @@ -289,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); - struct task_struct *t; - if (!w) { percpu_ref_put(&ca->io_ref); ret = -ENOMEM; goto err; } - percpu_ref_get(&ca->io_ref); - closure_get(&cl); w->cl = &cl; w->f = f; w->ca = ca; - t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { percpu_ref_put(&ca->io_ref); - closure_put(&cl); - f->ret = ret; - bch_err(c, "error starting kthread: %i", ret); + kfree(w); + bch_err_msg(c, ret, "starting kthread"); break; } + + closure_get(&cl); + percpu_ref_get(&ca->io_ref); + wake_up_process(t); } err: closure_sync(&cl); From cf164a91066d9af7db3cfa9ee2ac2e36f692dc5e Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Sat, 1 Mar 2025 17:34:33 -0500 Subject: [PATCH 113/180] bcachefs: bch2_dev_get_ioref() may now sleep The next patch implementing freezing will change bch2_dev_get_ioref() to sleep if a device is currently frozen. Add an annotation and fix the journal code accordingly: journal_write_done() now only notes that discards are needed while holding j->lock, and calls bch2_do_discards() after dropping it. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 6 +++++- fs/bcachefs/sb-members.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c12d9f9bd536..a510755a8364 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1664,6 +1664,7 @@ static CLOSURE_CALLBACK(journal_write_done) } bool completed = false; + bool do_discards = false; for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); @@ -1676,7 +1677,7 @@ static CLOSURE_CALLBACK(journal_write_done) j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; - bch2_do_discards(c); + do_discards = true; closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); } @@ -1727,6 +1728,9 @@ static CLOSURE_CALLBACK(journal_write_done) */ bch2_journal_do_writes(j); spin_unlock(&j->lock); + + if (do_discards) + bch2_do_discards(c); } static void journal_write_endio(struct bio *bio) diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index b29b6c6c21dd..df91b02ce575 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -283,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) { + might_sleep(); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); if (ca && !percpu_ref_tryget(&ca->io_ref)) From d71e023376d3e56bf2a787c9b5d2600a2db2aabf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 10:50:49 -0500 Subject: [PATCH 114/180] bcachefs: Change BCH_MEMBER_STATE_failed semantics Previously, we wouldn't try to read at all from a failed device - that doesn't make much
sense, the device may be unhealthy (perhaps taking longer than it should to service reads), but if it's our only option we should still try to read from it. Now, bch2_bkey_pick_read_device() will pick failed devices only if there are no non-failed replicas to read from. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 24 ++++++++++++++++++------ fs/bcachefs/sb-members.h | 2 +- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d9bdf433c118..032cd0bda017 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -79,12 +79,16 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, } } -static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +static inline u64 dev_latency(struct bch_dev *ca) { - struct bch_dev *ca = bch2_dev_rcu(c, dev); return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; } +static inline int dev_failed(struct bch_dev *ca) +{ + return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; +} + /* * returns true if p1 is better than p2: */ @@ -93,8 +97,16 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p2) { if (likely(!p1.idx && !p2.idx)) { - u64 l1 = dev_latency(c, p1.ptr.dev); - u64 l2 = dev_latency(c, p2.ptr.dev); + struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); + + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + + if (failed_delta) + return failed_delta < 0; + + u64 l1 = dev_latency(ca1); + u64 l2 = dev_latency(ca2); /* * Square the latencies, to bias more in favor of the faster @@ -170,7 +182,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ? 
f->idx : f->idx + 1; - if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) + if (!p.idx && (!ca || !bch2_dev_is_online(ca))) p.idx++; if (!p.idx && p.has_ec && bch2_force_reconstruct_read) @@ -1012,7 +1024,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); + return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); } void bch2_extent_ptr_set_cached(struct bch_fs *c, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index df91b02ce575..38261638a611 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -35,7 +35,7 @@ static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) return ret; } -static inline bool bch2_dev_is_readable(struct bch_dev *ca) +static inline bool bch2_dev_is_healthy(struct bch_dev *ca) { return bch2_dev_is_online(ca) && ca->mi.state != BCH_MEMBER_STATE_failed; From 981e3801443f507d74e2dae5710452642c96e8e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2025 18:44:23 -0500 Subject: [PATCH 115/180] bcachefs: Kick devices out after too many write IO errors We're improving our handling of write errors - we shouldn't write degraded data just because a write failed once, we should retry it (on other devices, if possible). But for this to work, we need to kick devices out when they're only returning errors - otherwise those retries will loop infinitely. This adds a configurable timeout - if writes are failing for too long, we'll set that device read-only. In the future we should also implement more tracking and another knob for an "allowed error rate", so that we can kick out drives that are acting "unhealthy". Another thing we'll want is a mechanism (likely in userspace) for bringing a device back in after a transient error - perhaps a cable was jiggled, or there was a controller reset. 
After transient errors we also need a mechanism to walk (from the journal) recent btree updates that weren't flushed to that device and treat them as "degraded", since unflushed data may well not have been written. Out of scope for this patch, but becoming relevant. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/error.c | 34 +++++++++++++++++++++++++--------- fs/bcachefs/error.h | 7 ++++++- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/super-io.c | 3 +++ 6 files changed, 41 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d2c3f59a668f..8abefc994016 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -536,6 +536,7 @@ struct bch_dev { */ struct bch_member_cpu mi; atomic64_t errors[BCH_MEMBER_ERROR_NR]; + unsigned long write_errors_start; __uuid_t uuid; char name[BDEVNAME_SIZE]; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a6cc817ccd87..7a5b0d211a82 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -860,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); +LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 3f93a5a6bbfa..6d68c89a49b2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; - bool dev; + + /* XXX: if it's reads or checksums that are failing, set it to failed */ down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, - 
BCH_FORCE_IF_DEGRADED); - if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED) - : bch2_fs_emergency_read_only(c)) + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) + goto out; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + bch_err(ca, - "too many IO errors, setting %s RO", + "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, dev ? "device" : "filesystem"); + if (!dev) + bch2_fs_emergency_read_only(c); + + } +out: up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { atomic64_inc(&ca->errors[type]); - //queue_work(system_long_wq, &ca->io_error_work); + + if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) + ca->write_errors_start = jiffies; + + queue_work(system_long_wq, &ca->io_error_work); } enum ask_yn { diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index a57b9f18d060..7d3f0e2a5fd6 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -226,8 +226,13 @@ static inline void bch2_account_io_success_fail(struct bch_dev *ca, enum bch_member_error_type type, bool success) { - if (!success) + if (likely(success)) { + if (type == BCH_MEMBER_ERROR_write && + ca->write_errors_start) + ca->write_errors_start = 0; + } else { bch2_io_error(ca, type); + } } static inline void bch2_account_io_completion(struct bch_dev *ca, diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 071a92ec8a14..afb89d318d24 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -145,6 +145,11 @@ enum fsck_err_opts { OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ + x(write_error_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 300), \ + 
BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ + NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 918e4e7704dd..ee32d043414a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -454,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); + + if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) + SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); } #ifdef __KERNEL__ From 4b0fac4bed0797c33e0852312e1dbe11baa3fb01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 12:00:56 -0500 Subject: [PATCH 116/180] bcachefs: journal write path comment Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index a510755a8364..331c9d762439 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1813,6 +1813,10 @@ static CLOSURE_CALLBACK(journal_write_preflush) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* + * Wait for previous journal writes to complete; they won't necessarily + * be flushed if they're still in flight + */ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { From 039790cfb5c8255cf9f5523017b9eb0006d1df33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 20:35:08 -0500 Subject: [PATCH 117/180] bcachefs: ec_stripe_delete() uses new stripe lru Convert to the new persistent stripe LRU.
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 64 +++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8c7a9addafae..dba4b599f827 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1152,37 +1152,22 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_stripe s; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); - ret = -EINVAL; - goto err; - } - - s = bkey_s_c_to_stripe(k); - for (unsigned i = 0; i < s.v->nr_blocks; i++) - if (stripe_blockcount_get(s.v, i)) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); + /* + * We expect write buffer races here + * Important: check stripe_is_open with stripe key locked: + */ + if (k.k->type == KEY_TYPE_stripe && + !bch2_stripe_is_open(trans->c, idx) && + stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) + ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1197,21 +1182,16 @@ static void ec_stripe_delete_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - while (1) { - mutex_lock(&c->ec_stripes_heap_lock); - u64 idx = stripe_idx_to_delete(c); - mutex_unlock(&c->ec_stripes_heap_lock); - - if (!idx) - break; - - int 
ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - ec_stripe_delete(trans, idx)); - bch_err_fn(c, ret); - if (ret) - break; - } - + bch2_trans_run(c, + bch2_btree_write_buffer_tryflush(trans) ?: + for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), + 0, lru_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + ec_stripe_delete(trans, lru_k.k->p.offset); + }))); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } From 6c336144b9a1b671fccd4d90f1cfb5e9a5398bfa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 20:34:47 -0500 Subject: [PATCH 118/180] bcachefs: get_existing_stripe() uses new stripe lru Convert to the new persistent stripe LRU. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 86 +++++++++++++++++++++++++++--------------------- fs/bcachefs/ec.h | 10 +++--- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index dba4b599f827..84f232f4cbf8 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1981,39 +1981,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, return 0; } -static s64 get_existing_stripe(struct bch_fs *c, - struct ec_stripe_head *head) +static int __get_existing_stripe(struct btree_trans *trans, + struct ec_stripe_head *head, + struct ec_stripe_buf *stripe, + u64 idx) { - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t heap_idx; - u64 stripe_idx; - s64 ret = -1; + struct bch_fs *c = trans->c; - if (may_create_new_stripe(c)) - return -1; + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), 0); + int ret = bkey_err(k); + if (ret) + goto err; - mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { - /* No blocks worth reusing, stripe will just be deleted: */ - if (!h->data[heap_idx].blocks_nonempty) - 
continue; + /* We expect write buffer races here */ + if (k.k->type != KEY_TYPE_stripe) + goto out; - stripe_idx = h->data[heap_idx].idx; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + if (stripe_lru_pos(s.v) <= 1) + goto out; - m = genradix_ptr(&c->stripes, stripe_idx); - - if (m->disk_label == head->disk_label && - m->algorithm == head->algo && - m->nr_redundant == head->redundancy && - m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant && - bch2_try_open_stripe(c, head->s, stripe_idx)) { - ret = stripe_idx; - break; - } + if (s.v->disk_label == head->disk_label && + s.v->algorithm == head->algo && + s.v->nr_redundant == head->redundancy && + le16_to_cpu(s.v->sectors) == head->blocksize && + bch2_try_open_stripe(c, head->s, idx)) { + bkey_reassemble(&stripe->key, k); + ret = 1; } - mutex_unlock(&c->ec_stripes_heap_lock); +out: + bch2_set_btree_iter_dontneed(&iter); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2065,24 +2066,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - s64 idx; - int ret; /* * If we can't allocate a new stripe, and there's no stripes with empty * blocks for us to reuse, that means we have to wait on copygc: */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; + if (may_create_new_stripe(c)) + return -1; - ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, s); - return ret; + struct btree_iter lru_iter; + struct bkey_s_c lru_k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), + 0, lru_k, ret) { + ret = __get_existing_stripe(trans, h, 
&s->existing_stripe, lru_k.k->p.offset); + if (ret) + break; } + bch2_trans_iter_exit(trans, &lru_iter); + if (!ret) + ret = -BCH_ERR_stripe_alloc_blocked; + if (ret == 1) + ret = 0; + if (ret) + return ret; return init_new_stripe_from_existing(c, s); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index cd1c837e4933..3008d41db12d 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -99,15 +99,13 @@ static inline u64 stripe_lru_pos(const struct bch_stripe *s) if (!s) return 0; - unsigned blocks_empty = 0, blocks_nonempty = 0; + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; - for (unsigned i = 0; i < s->nr_blocks; i++) { - blocks_empty += !stripe_blockcount_get(s, i); - blocks_nonempty += !!stripe_blockcount_get(s, i); - } + for (unsigned i = 0; i < nr_data; i++) + blocks_empty += !stripe_blockcount_get(s, i); /* Will be picked up by the stripe_delete worker */ - if (!blocks_nonempty) + if (blocks_empty == nr_data) return STRIPE_LRU_POS_EMPTY; if (!blocks_empty) From 434a3f2ffaa1519a562909a92c62b77cf29f05da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 14:30:29 -0500 Subject: [PATCH 119/180] bcachefs: trace_stripe_create Add a simple tracepoint for stripe creation, we'll want to expand this later. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 5 +++++ fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/trace.h | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 84f232f4cbf8..37269c0f79b5 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1486,6 +1486,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); + ret = s->err; goto err; } @@ -1494,6 +1495,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); + ret = -BCH_ERR_ec_block_read; goto err; } @@ -1519,6 +1521,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); + ret = -BCH_ERR_ec_block_write; goto err; } @@ -1540,6 +1543,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ret) goto err; err: + trace_stripe_create(c, s->idx, ret); + bch2_disk_reservation_put(c, &s->res); for (i = 0; i < v->nr_blocks; i++) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index d45ef03abc91..e14e0d1cc93d 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -280,6 +280,8 @@ x(EIO, insufficient_journal_devices) \ x(EIO, device_offline) \ x(EIO, EIO_fault_injected) \ + x(EIO, ec_block_read) \ + x(EIO, ec_block_write) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5718988dd7d6..c8669a6b9cec 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -339,6 +339,30 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +/* ec.c */ + +TRACE_EVENT(stripe_create, + TP_PROTO(struct 
bch_fs *c, u64 idx, int ret), + TP_ARGS(c, idx, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, idx ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk("%d,%d idx %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->idx, + __entry->ret) +); + /* Journal */ DEFINE_EVENT(bch_fs, journal_full, From 94373026d930b9ed72c8f8f0f3d532e13654fdb1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 20:15:40 -0500 Subject: [PATCH 120/180] bcachefs: We no longer read stripes into memory at startup And the stripes heap gets deleted. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 - fs/bcachefs/ec.c | 223 +--------------------------- fs/bcachefs/ec.h | 5 - fs/bcachefs/ec_types.h | 7 - fs/bcachefs/recovery_passes_types.h | 2 +- fs/bcachefs/sysfs.c | 5 - 6 files changed, 2 insertions(+), 244 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8abefc994016..b432bb6e6f6e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1003,15 +1003,11 @@ struct bch_fs { wait_queue_head_t copygc_running_wq; /* STRIPES: */ - GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; spinlock_t ec_stripes_new_lock; - ec_stripes_heap ec_stripes_heap; - struct mutex ec_stripes_heap_lock; - /* ERASURE CODING */ struct list_head ec_stripe_head_list; struct mutex ec_stripe_head_lock; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 37269c0f79b5..c73ba73f6890 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -380,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } -static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) -{ - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->disk_label = s->disk_label; - 
m->blocks_nonempty = 0; - - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); -} - int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -495,38 +482,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - if (flags & BTREE_TRIGGER_atomic) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - stripe_to_mem(m, new_s); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } - return 0; } @@ -942,26 +897,6 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { - ec_stripes_heap n, *h = &c->ec_stripes_heap; - - if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - - mutex_lock(&c->ec_stripes_heap_lock); - if (n.size > h->size) { - memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); - n.nr = h->nr; - swap(*h, n); - } - mutex_unlock(&c->ec_stripes_heap_lock); - - free_heap(&n); - } - - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; @@ -1034,120 +969,6 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) 
s->idx = 0; } -/* Heap of all existing stripes, ordered by blocks_nonempty */ - -static u64 stripe_idx_to_delete(struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - - lockdep_assert_held(&c->ec_stripes_heap_lock); - - if (h->nr && - h->data[0].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[0].idx)) - return h->data[0].idx; - - return 0; -} - -static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, - size_t i) -{ - struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - - genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; -} - -static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - - return ((_l->blocks_nonempty > _r->blocks_nonempty) < - (_l->blocks_nonempty < _r->blocks_nonempty)); -} - -static inline void ec_stripes_heap_swap(void *l, void *r, void *h) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - ec_stripes_heap *_h = (ec_stripes_heap *)h; - size_t i = _l - _h->data; - size_t j = _r - _h->data; - - swap(*_l, *_r); - - ec_stripes_heap_set_backpointer(_h, i); - ec_stripes_heap_set_backpointer(_h, j); -} - -static const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, -}; - -static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes, idx); - - BUG_ON(m->heap_idx >= h->nr); - BUG_ON(h->data[m->heap_idx].idx != idx); -} - -void bch2_stripes_heap_del(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); - 
mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_insert(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(min_heap_full(&c->ec_stripes_heap)); - - genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; - min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { - .idx = idx, - .blocks_nonempty = m->blocks_nonempty, - }), - &callbacks, - &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - bool do_deletes; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); - min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - - do_deletes = stripe_idx_to_delete(c) != 0; - mutex_unlock(&c->ec_stripes_heap_lock); - - if (do_deletes) - bch2_do_stripe_deletes(c); -} - /* stripe deletion */ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) @@ -2395,46 +2216,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, ({ - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - - stripe_to_mem(m, bkey_s_c_to_stripe(k).v); - - bch2_stripes_heap_insert(c, m, k.k->p.offset); - 0; - }))); - bch_err_fn(c, ret); - return ret; -} - -void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t i; - - 
mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->nr, 50); i++) { - m = genradix_ptr(&c->stripes, h->data[i].idx); - - prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, - h->data[i].blocks_nonempty, - m->nr_blocks - m->nr_redundant, - m->nr_redundant); - if (bch2_stripe_is_open(c, h->data[i].idx)) - prt_str(out, " open"); - prt_newline(out); - } - mutex_unlock(&c->ec_stripes_heap_lock); + return 0; } static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -2505,15 +2287,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); - free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } void bch2_fs_ec_init_early(struct bch_fs *c) { spin_lock_init(&c->ec_stripes_new_lock); - mutex_init(&c->ec_stripes_heap_lock); INIT_LIST_HEAD(&c->ec_stripe_head_list); mutex_init(&c->ec_stripe_head_lock); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 3008d41db12d..8f2228e59eda 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -258,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, enum bch_watermark, struct closure *); -void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); - void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); @@ -298,7 +294,6 @@ void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 37558cc2d89f..06144bfd9c19 100644 --- a/fs/bcachefs/ec_types.h +++ 
b/fs/bcachefs/ec_types.h @@ -31,11 +31,4 @@ struct gc_stripe { struct bch_replicas_padded r; }; -struct ec_stripe_heap_entry { - size_t idx; - unsigned blocks_nonempty; -}; - -typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; - #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 418557960ed6..e89b9c783285 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -24,7 +24,7 @@ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ + x(stripes_read, 1, 0) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ x(check_allocations, 5, PASS_FSCK) \ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a9953181c29b..2ed3f755eadd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -174,7 +174,6 @@ read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_reserve_cache); -read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); @@ -355,9 +354,6 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_reserve_cache) bch2_btree_reserve_cache_to_text(out, c); - if (attr == &sysfs_stripes_heap) - bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, NULL); @@ -566,7 +562,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_reserve_cache, &sysfs_new_stripes, - &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, #ifdef BCH_WRITE_REF_DEBUG From c073ec6bec0d05781380ecabca9e8611e4b48502 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 8 Mar 2025 20:53:53 +0100 Subject: [PATCH 121/180] bcachefs: Remove unnecessary byte allocation The extra byte is not used - remove it. 
Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 17c035f9d629..5b47b94fe1ea 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -244,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, int ret = 0; subvol_inum inum; - kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); if (!kname) return -ENOMEM; From ff4cb203ccce24630c50a503973ac596c3d5d1be Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 11 Mar 2025 12:13:11 +0100 Subject: [PATCH 122/180] bcachefs: Use max() to improve gen_after() Use max() to simplify gen_after() and improve its readability. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6aeec1c0973c..c5363256e363 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -140,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b) static inline int gen_after(u8 a, u8 b) { - int r = gen_cmp(a, b); - - return r > 0 ? 
r : 0; + return max(0, gen_cmp(a, b)); } static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) From a2e9e6874612582367be674e4d961de2ec8a9d05 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 09:31:03 -0400 Subject: [PATCH 123/180] bcachefs: Kill a bit of dead code Found with CC=clang W=1 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 -------------- fs/bcachefs/inode.c | 13 ------------- fs/bcachefs/journal_io.c | 5 ----- fs/bcachefs/move.c | 2 -- 4 files changed, 34 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e32fce4fd258..7542c6f9c88e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_peek(&l->iter, l->b)); - - path->pos = k.k ? 
k.k->p : l->b->key.k.p; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 339b80770f1d..7aca010e2e10 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -868,19 +868,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid, gid, mode, rdev, parent); } -static inline u32 bkey_generation(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - BUG(); - case KEY_TYPE_inode_generation: - return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); - default: - return 0; - } -} - static struct bkey_i_inode_alloc_cursor * bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 331c9d762439..cf2700b06d58 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1609,11 +1609,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) kvfree(new_buf); } -static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -{ - return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -} - static CLOSURE_CALLBACK(journal_write_done) { closure_type(w, struct journal_buf, io); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ee489d222fba..0787d04a5fc3 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -712,7 +712,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; @@ -834,7 +833,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(sectors, 
&ctxt->stats->sectors_seen); - sectors_moved += sectors; next: bch2_btree_iter_advance(&bp_iter); } From 8dc4514d58f684b9bc08d956ab9a9ec65b38f63a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 11:44:52 -0400 Subject: [PATCH 124/180] bcachefs: Kill bch2_remount() Single caller, so inline it. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 71 ++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 459ca8259fc0..17ac9c55fb96 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2026,44 +2026,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, - struct bch_opts opts) -{ - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - - if (opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts, errors)) - c->opts.errors = opts.errors; -err: - return bch2_err_class(ret); -} - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -2374,8 +2336,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = sb->s_fs_info; + int ret = 0; - return bch2_remount(sb, &fc->sb_flags, opts->opts); + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + + if (opts->opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if 
(opts->opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts->opts.read_only; + + up_write(&c->state_lock); + } + + if (opt_defined(opts->opts, errors)) + c->opts.errors = opts->opts.errors; +err: + return bch2_err_class(ret); } static const struct fs_context_operations bch2_context_ops = { From c991fbee8e6e91e9d0c859627b87fb7a06244a8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 15:21:13 -0400 Subject: [PATCH 125/180] bcachefs: rebalance, copygc status also print stacktrace These are commonly needed when debugging, and saves from having to ask users to dig. Also, rebalance_status now includes pending rebalance work. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 14 ++++++++------ fs/bcachefs/movinggc.c | 11 +++++++++++ fs/bcachefs/rebalance.c | 29 ++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0787d04a5fc3..f86fb8ad636a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1251,17 +1251,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen: "); + prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_printf(out, "bytes moved: "); + prt_printf(out, "bytes moved:\t"); prt_human_readable_u64(out, 
atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_printf(out, "bytes raced: "); + prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1270,7 +1270,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { - struct moving_io *io; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); @@ -1290,6 +1291,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); + struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index fa19fc44622c..5126c870ce5b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } } static int bch2_copygc_thread(void *arg) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 58f6d97e506c..8b6795ec82f6 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -590,8 +590,19 @@ static int bch2_rebalance_thread(void *arg) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) { + printbuf_tabstop_push(out, 32); + struct bch_fs_rebalance *r = &c->rebalance; + /* print pending work */ + struct disk_accounting_pos acc = { .type = 
BCH_DISK_ACCOUNTING_rebalance_work, }; + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + + prt_printf(out, "pending work:\t"); + prt_human_readable_u64(out, v); + prt_printf(out, "\n\n"); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); printbuf_indent_add(out, 2); @@ -600,15 +611,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) case BCH_REBALANCE_waiting: { u64 now = atomic64_read(&c->io_clock[WRITE].now); - prt_str(out, "io wait duration: "); + prt_printf(out, "io wait duration:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); - prt_str(out, "io wait remaining: "); + prt_printf(out, "io wait remaining:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); - prt_str(out, "duration waited: "); + prt_printf(out, "duration waited:\t"); bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); prt_newline(out); break; @@ -621,6 +632,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) break; } prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } + printbuf_indent_sub(out, 2); } From 7c1e2a254fbc023df8d681946bab69cd68a4bde6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Mar 2025 18:19:17 -0400 Subject: [PATCH 126/180] bcachefs: Add a cond_resched() to btree cache teardown [12308.606480] watchdog: BUG: soft lockup - CPU#18 stuck for 26s! 
[umount:48479] [12308.606485] Modules linked in: bcachefs lz4hc_compress lz4_compress lz4_decompress sunrpc overlay nf_conntrack_netlink xt_nat xt_tcpudp veth xt_conntrack xt_MASQUERADE bridge stp llc xfrm_user ip6table_nat ip6table_filter ip6_tables iptable_nat xt_addrtype iptable_filter ip_tables x_tables nfnetlink_cttimeout nfnetlink openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 psample ext4 mbcache jbd2 nls_iso8859_1 nls_cp850 vfat fat binfmt_misc skx_edac_common nfit edac_core libnvdimm cbc encrypted_keys intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common ipmi_ssif x86_pkg_temp_thermal intel_powerclamp kvm_intel kvm drivetemp rapl intel_cstate coretemp mgag200 i2c_algo_bit ixgbe drm_shmem_helper drm_kms_helper mdio_devres xfrm_algo mdio drm ptp intel_uncore mei_me efi_pstore evdev uas pl2303 pps_core libphy usb_storage usbserial lpc_ich mei drm_panel_orientation_quirks acpi_power_meter tiny_power_button ipmi_si mfd_core intel_pch_thermal acpi_tad acpi_ipmi ioatdma [12308.606541] ipmi_devintf ipmi_msghandler dca wmi button efivarfs polyval_clmulni polyval_generic ghash_clmulni_intel sha512_ssse3 sha256_ssse3 sha1_ssse3 sha1_generic xhci_pci xhci_hcd aesni_intel ehci_pci ehci_hcd gf128mul crypto_simd cryptd usbcore hpwdt usb_common [12308.606557] CPU: 18 UID: 0 PID: 48479 Comm: umount Tainted: G L 6.14.0-rc6-x86_64-00159-ga09496a03e63 #1 [12308.606560] Tainted: [L]=SOFTLOCKUP [12308.606561] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 07/20/2023 [12308.606563] RIP: 0010:clear_page_erms+0x7/0x10 [12308.606570] Code: 48 89 47 38 48 8d 7f 40 75 d9 90 c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 b9 00 10 00 00 31 c0 aa c3 cc cc cc cc 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 [12308.606572] RSP: 0018:ffff9ed5b622fba0 EFLAGS: 00010246 [12308.606574] RAX: 0000000000000000 RBX: ffff90347fffe6c0 RCX: 00000000000004c0 [12308.606575] RDX: 
ffffe34ea9bec1c0 RSI: 00000000000405f0 RDI: ffff902eafb07b40 [12308.606576] RBP: ffff9ed5b622fbf0 R08: 0000000000000001 R09: 0000000000000006 [12308.606577] R10: 0000000000040001 R11: 0000000000000000 R12: ffffe34ea9bec000 [12308.606578] R13: 0000000000000000 R14: 0000000000000006 R15: ffffe34ea9bed000 [12308.606580] FS: 00007fe704ecfb68(0000) GS:ffff9053fea00000(0000) knlGS:0000000000000000 [12308.606581] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [12308.606582] CR2: 00007f18159068ae CR3: 00000001314d0005 CR4: 00000000007726f0 [12308.606583] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [12308.606584] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [12308.606584] PKRU: 55555554 [12308.606585] Call Trace: [12308.606587] [12308.606590] ? show_regs.cold+0x19/0x28 [12308.606595] ? watchdog_timer_fn.cold+0x3d/0x9d [12308.606598] ? __pfx_watchdog_timer_fn+0x10/0x10 [12308.606602] ? __hrtimer_run_queues+0x12e/0x250 [12308.606607] ? hrtimer_interrupt+0xfd/0x220 [12308.606609] ? __sysvec_apic_timer_interrupt+0x53/0xe0 [12308.606614] ? sysvec_apic_timer_interrupt+0x76/0xa0 [12308.606619] [12308.606620] [12308.606620] ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 [12308.606626] ? clear_page_erms+0x7/0x10 [12308.606628] ? 
__free_pages_ok+0x374/0x640 [12308.606633] free_frozen_pages+0x34/0x570 [12308.606636] __folio_put+0x87/0xe0 [12308.606641] free_large_kmalloc+0x70/0x80 [12308.606645] kfree+0x2f6/0x390 [12308.606648] kvfree+0x2d/0x40 [12308.606653] __btree_node_data_free+0xaf/0xf0 [bcachefs] [12308.606726] btree_node_data_free+0x6a/0x80 [bcachefs] [12308.606778] bch2_fs_btree_cache_exit+0x262/0x440 [bcachefs] [12308.606829] bch2_fs_release+0xe8/0x340 [bcachefs] [12308.606905] kobject_put+0x60/0xc0 [12308.606908] bch2_fs_free+0xdd/0x120 [bcachefs] [12308.606981] bch2_kill_sb+0x1e/0x30 [bcachefs] [12308.607051] deactivate_locked_super+0x32/0xb0 [12308.607055] deactivate_super+0x40/0x50 [12308.607057] cleanup_mnt+0xc3/0x160 [12308.607060] __cleanup_mnt+0x12/0x20 [12308.607062] task_work_run+0x5f/0xa0 [12308.607064] syscall_exit_to_user_mode+0x194/0x1a0 [12308.607066] do_syscall_64+0x67/0x170 [12308.607068] entry_SYSCALL_64_after_hwframe+0x76/0x7e [12308.607070] RIP: 0033:0x7fe704e66eed [12308.607073] Code: 08 49 89 ca b8 a5 00 00 00 0f 05 48 89 c7 e8 8a e6 ff ff 48 83 c4 Reported-by: Stijn Tintel Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1ec1f90e0eb3..54666027aa85 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) btree_node_write_in_flight(b)); btree_node_data_free(bc, b); + cond_resched(); } BUG_ON(!bch2_journal_error(&c->journal) && From 9ec00891493d3e4f60678ed12988761538f95bd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 00:47:51 -0400 Subject: [PATCH 127/180] bcachefs: bch2_bkey_ptrs_rebalance_opts() Small optimization for bch2_bkey_sectors_need_rebalance() Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/rebalance.c 
b/fs/bcachefs/rebalance.c index 8b6795ec82f6..29a569384146 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -26,9 +26,8 @@ /* bch_extent_rebalance: */ -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) @@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return NULL; } +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s_c k, @@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); if (!opts) return 0; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; From 6d80fca9efe9255369aa91e85e8f3367c42acdde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 11:54:13 -0400 Subject: [PATCH 128/180] bcachefs: Don't create bch_io_failures unless it's needed Only needed in retry path, no point in wasting stack space. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 73275da5d2c4..6bdb8efb4cd1 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -147,13 +147,11 @@ void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { - struct bch_io_failures failed = { .nr = 0 }; - BUG_ON(rbio->_state); rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped); From 5a06cb8000addbbfd1f9ce6891098da5b48d3d1e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 18:42:56 -0500 Subject: [PATCH 129/180] bcachefs: Debug params for data corruption injection dm-flakey is busted, and this is simpler anyways - this lets us test the checksum error retry paths Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 8 ++++++++ fs/bcachefs/io_write.c | 24 ++++++++++++++++++++++++ fs/bcachefs/util.c | 21 +++++++++++++++++++++ fs/bcachefs/util.h | 12 ++++++++++++ 4 files changed, 65 insertions(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 70e5c5a32d01..d39f321b51fc 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -27,6 +27,12 @@ #include +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_read_corrupt_ratio; +module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(read_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -688,6 +694,8 @@ static void __bch2_read_endio(struct work_struct *work) src->bi_iter = rbio->bvec_iter; } + bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); + csum = bch2_checksum_bio(c, crc.csum_type, nonce,
src); bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index dbfcb28f003d..48befbae0226 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -34,6 +34,12 @@ #include #include +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_write_corrupt_ratio; +module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(write_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, @@ -1005,6 +1011,15 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bounce = true; } +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); + if (!bounce && write_corrupt_ratio) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } +#endif saved_iter = dst->bi_iter; do { @@ -1114,6 +1129,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, init_append_extent(op, wp, version, crc); +#ifdef CONFIG_BCACHEFS_DEBUG + if (write_corrupt_ratio) { + swap(dst->bi_iter.bi_size, dst_len); + bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); + swap(dst->bi_iter.bi_size, dst_len); + } +#endif + if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); @@ -1394,6 +1417,7 @@ static void bch2_nocow_write(struct bch_write_op *op) bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, op->insert_keys.top, true); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index a7edbcca1a84..553de8d8e3e5 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -704,6 +704,27 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *bio) +{ + struct bvec_iter iter; + 
struct bio_vec bv; + unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); + + bio_for_each_segment(bv, bio, iter) { + unsigned u64s = bv.bv_len / sizeof(u64); + + if (offset < u64s) { + u64 *segment = bvec_kmap_local(&bv); + segment[offset] = get_random_u64(); + kunmap_local(segment); + return; + } + offset -= u64s; + } +} +#endif + #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index f4a4783219d9..d41e133acc4d 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -406,6 +406,18 @@ u64 bch2_get_random_u64_below(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *); + +static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) +{ + if (ratio && !get_random_u32_below(ratio)) + bch2_corrupt_bio(bio); +} +#else +#define bch2_maybe_corrupt_bio(...) do {} while (0) +#endif + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { From 943f0cfb1559ac6c9fc9082998f20dfe2aa01a74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 7 Mar 2025 17:20:22 -0500 Subject: [PATCH 130/180] bcachefs: Convert read path to standard error codes Kill the READ_ERR/READ_RETRY/READ_RETRY_AVOID enums, and add standard error codes that describe precisely which error occurred. This is going to be used for the data move path, to move but poison extents with checksum errors.
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 13 ++++++ fs/bcachefs/io_read.c | 95 +++++++++++++++++++++++-------------------- fs/bcachefs/io_read.h | 4 +- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index e14e0d1cc93d..5050d978624b 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -282,6 +282,19 @@ x(EIO, EIO_fault_injected) \ x(EIO, ec_block_read) \ x(EIO, ec_block_write) \ + x(EIO, data_read) \ + x(BCH_ERR_data_read, data_read_retry) \ + x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ + x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ + x(BCH_ERR_data_read, data_read_decompress_err) \ + x(BCH_ERR_data_read, data_read_decrypt_err) \ + x(BCH_ERR_data_read, data_read_ptr_stale_race) \ + x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ + x(BCH_ERR_data_read, data_read_no_encryption_key) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index d39f321b51fc..797c29bde9b6 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -347,10 +347,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); } -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - enum rbio_context { RBIO_CONTEXT_NULL, RBIO_CONTEXT_HIGHPRI, @@ -452,7 +448,7 @@ static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_ err: bch2_trans_iter_exit(trans, &iter); - if (ret == READ_RETRY) 
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) rbio->bio.bi_status = BLK_STS_IOERR; @@ -479,11 +475,13 @@ static void bch2_rbio_retry(struct work_struct *work) this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (rbio->retry == READ_RETRY_AVOID) + if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick); - if (!rbio->split) - rbio->bio.bi_status = 0; + if (!rbio->split) { + rbio->bio.bi_status = 0; + rbio->ret = 0; + } rbio = bch2_rbio_free(rbio); @@ -498,23 +496,29 @@ static void bch2_rbio_retry(struct work_struct *work) __bch2_read(c, rbio, iter, inum, &failed, flags); } -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) +static void bch2_rbio_error(struct bch_read_bio *rbio, + int ret, blk_status_t blk_error) { - rbio->retry = retry; - rbio->saw_error = true; + BUG_ON(ret >= 0); + + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + + bch2_rbio_parent(rbio)->saw_error = true; if (rbio->flags & BCH_READ_in_retry) return; - if (retry == READ_ERR) { - rbio = bch2_rbio_free(rbio); - - rbio->bio.bi_status = error; - bch2_rbio_done(rbio); - } else { + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { bch2_rbio_punt(rbio, bch2_rbio_retry, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } else { + rbio = bch2_rbio_free(rbio); + + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + + bch2_rbio_done(rbio); } } @@ -536,7 +540,7 @@ static void bch2_read_io_err(struct work_struct *work) bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, @@ -623,7 +627,7 @@ static void bch2_read_csum_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); 
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -643,7 +647,7 @@ static void bch2_read_decompress_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -663,7 +667,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -706,7 +710,8 @@ static void __bch2_read_endio(struct work_struct *work) */ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, + BLK_STS_IOERR); goto out; } @@ -820,9 +825,9 @@ static void bch2_read_endio(struct bio *bio) trace_and_count(c, io_read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_retry_if_stale) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); return; } @@ -895,7 +900,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_read_bio *rbio = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; + int ret = 0; if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, @@ -911,16 +916,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto out_read_done; } retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); + ret = 
bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ - if (!pick_ret) + if (!ret) goto hole; - if (unlikely(pick_ret < 0)) { + if (unlikely(ret < 0)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); + prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); @@ -936,6 +941,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); + ret = -BCH_ERR_data_read_no_encryption_key; goto err; } @@ -1071,7 +1077,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; rbio->hole = 0; - rbio->retry = 0; + rbio->ret = 0; rbio->context = 0; rbio->pick = pick; rbio->subvol = orig->subvol; @@ -1126,7 +1132,9 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, + -BCH_ERR_data_read_retry_device_offline, + BLK_STS_IOERR); goto out; } @@ -1152,7 +1160,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, + BLK_STS_IOERR); goto out; } @@ -1170,13 +1179,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); - ret = rbio->retry; + ret = rbio->ret; rbio = bch2_rbio_free(rbio); - if (ret == READ_RETRY_AVOID) { + if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(failed, 
&pick); - ret = READ_RETRY; - } if (!ret) goto out_read_done; @@ -1186,9 +1193,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, err: if (flags & BCH_READ_in_retry) - return READ_ERR; + return ret; - orig->bio.bi_status = BLK_STS_IOERR; + orig->bio.bi_status = BLK_STS_IOERR; + orig->ret = ret; goto out_read_done; hole: @@ -1285,8 +1293,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - ret != READ_RETRY && - ret != READ_RETRY_AVOID) + !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } @@ -1297,11 +1304,13 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); + prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - rbio->bio.bi_status = BLK_STS_IOERR; + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + bch2_rbio_done(rbio); } diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 6bdb8efb4cd1..1eb01e9847d7 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -42,11 +42,11 @@ struct bch_read_bio { narrow_crcs:1, hole:1, saw_error:1, - retry:2, context:2; }; u16 _state; }; + s16 ret; struct extent_ptr_decoded pick; @@ -164,6 +164,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, rbio->c = orig->c; rbio->_state = 0; + rbio->ret = 0; rbio->split = true; rbio->parent = orig; rbio->opts = orig->opts; @@ -180,6 +181,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; + rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; return rbio; From e75993b0bf8baa48b2e96d693852191f63b615fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 09:04:09 
-0400 Subject: [PATCH 131/180] bcachefs: Fix BCH_ERR_data_read_csum_err_maybe_userspace in retry path When we do a read to a buffer that's mapped into userspace, it's possible to get a spurious checksum error if userspace was modifying the buffer at the same time. When we retry those, they have to be bounced before we know definitively whether we're reading corrupt data. But the retry path propagates read flags differently, so needs special handling. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 797c29bde9b6..17bc413c27ba 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1291,6 +1291,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: + if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) + flags |= BCH_READ_must_bounce; + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_data_read_retry)) From f4b84bac20d6999db9e7db2254e63471c6c3fd9c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 11:24:22 -0500 Subject: [PATCH 132/180] bcachefs: Read error message now indicates if it was for an internal move Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 17bc413c27ba..a7865f34ea35 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -335,10 +335,17 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - return lockrestart_do(trans, + int ret = lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, out, (subvol_inum) { rbio->subvol, read_pos.inode }, read_pos.offset << 9)); +
if (ret) + return ret; + + if (rbio->flags & BCH_READ_data_update) + prt_str(out, "(internal move) "); + + return 0; } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, From 881b598ef144a1dee3850be9e6b9ecfcfc5bf4b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 11:37:51 -0500 Subject: [PATCH 133/180] bcachefs: BCH_ERR_data_read_buffer_too_small Now that the read path uses proper error codes, we can get rid of the weird rbio->hole signalling to the move path that the read didn't happen. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/io_read.c | 9 ++++----- fs/bcachefs/io_read.h | 1 - fs/bcachefs/move.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 5050d978624b..afa16d58041e 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -295,6 +295,8 @@ x(BCH_ERR_data_read, data_read_ptr_stale_race) \ x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ x(BCH_ERR_data_read, data_read_no_encryption_key) \ + x(BCH_ERR_data_read, data_read_buffer_too_small) \ + x(BCH_ERR_data_read, data_read_key_overwritten) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index a7865f34ea35..d1497af58180 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -443,7 +443,7 @@ static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->hole = true; + rbio->ret = -BCH_ERR_data_read_key_overwritten; goto err; } @@ -1000,10 +1000,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, */ struct data_update *u = container_of(orig, struct data_update, rbio); if 
(pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { - BUG(); if (ca) percpu_ref_put(&ca->io_ref); - goto hole; + rbio->ret = -BCH_ERR_data_read_buffer_too_small; + goto out_read_done; } iter.bi_size = pick.crc.compressed_size << 9; @@ -1083,7 +1083,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->flags = flags; rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; rbio->ret = 0; rbio->context = 0; rbio->pick = pick; @@ -1215,7 +1214,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, * to read no longer exists we have to signal that: */ if (flags & BCH_READ_data_update) - orig->hole = true; + orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); out_read_done: diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 1eb01e9847d7..924406558f78 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -40,7 +40,6 @@ struct bch_read_bio { split:1, have_ioref:1, narrow_crcs:1, - hole:1, saw_error:1, context:2; }; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f86fb8ad636a..307b918fa2e7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -125,8 +125,8 @@ static void move_write(struct moving_io *io) &ctxt->stats->sectors_error_corrected); } - if (unlikely(io->write.rbio.bio.bi_status || - io->write.rbio.hole || + if (unlikely(io->write.rbio.ret || + io->write.rbio.bio.bi_status || io->write.data_opts.scrub)) { move_free(io); return; From de73677ff8e677bf84a0eefa17b3913f65b57a74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 19:37:10 -0500 Subject: [PATCH 134/180] bcachefs: Return errors to top level bch2_rbio_retry() Next patch will be adding an additional retry loop for checksum errors, so that we can rule out transient errors before marking an extent as poisoned. 
Prerequisite to this is returning errors to bch2_rbio_retry(); this will also let us add a "successful retry" message. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 41 ++++++++++++++++++++++++++--------------- fs/bcachefs/io_read.h | 4 ++-- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index d1497af58180..e54103f79323 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -422,7 +422,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, +static noinline int bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) @@ -457,12 +457,16 @@ static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_ if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; - if (ret) - rbio->bio.bi_status = BLK_STS_IOERR; + + if (ret) { + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + } BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - bch2_rbio_done(rbio); bch2_trans_put(trans); + + return ret; } static void bch2_rbio_retry(struct work_struct *work) @@ -497,10 +501,16 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_last_fragment; flags |= BCH_READ_must_clone; - if (flags & BCH_READ_data_update) - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - else - __bch2_read(c, rbio, iter, inum, &failed, flags); + int ret = flags & BCH_READ_data_update + ? 
bch2_read_retry_nodecode(c, rbio, iter, &failed, flags) + : __bch2_read(c, rbio, iter, inum, &failed, flags); + + if (ret) { + rbio->ret = ret; + rbio->bio.bi_status = BLK_STS_IOERR; + } + + bch2_rbio_done(rbio); } static void bch2_rbio_error(struct bch_read_bio *rbio, @@ -1191,9 +1201,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(failed, &pick); - if (!ret) - goto out_read_done; - return ret; } @@ -1218,12 +1225,13 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_last_fragment) + if ((flags & BCH_READ_last_fragment) && + !(flags & BCH_READ_in_retry)) bch2_rbio_done(orig); return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, +int __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { @@ -1313,18 +1321,21 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9)); - prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret)); + prt_printf(&buf, "read error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); rbio->bio.bi_status = BLK_STS_IOERR; rbio->ret = ret; - bch2_rbio_done(rbio); + if (!(flags & BCH_READ_in_retry)) + bch2_rbio_done(rbio); } bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); + + return ret; } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 924406558f78..42a22985d789 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -140,8 +140,8 @@ static inline void bch2_read_extent(struct btree_trans *trans, data_btree, k, offset_into_extent, NULL, flags, -1); } -void __bch2_read(struct bch_fs *, struct 
bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); +int __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) From ccba9029b01cdcc1aa6f3ed6375efdc0d779cc8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 18:42:34 -0500 Subject: [PATCH 135/180] bcachefs: Print message on successful read retry Users have been asking for this, and now that errors are returned to the top level read retry path - we can. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e54103f79323..887e3c9ac181 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -494,6 +494,9 @@ static void bch2_rbio_retry(struct work_struct *work) rbio->ret = 0; } + unsigned subvol = rbio->subvol; + struct bpos read_pos = rbio->read_pos; + rbio = bch2_rbio_free(rbio); flags |= BCH_READ_in_retry; @@ -508,6 +511,19 @@ static void bch2_rbio_retry(struct work_struct *work) if (ret) { rbio->ret = ret; rbio->bio.bi_status = BLK_STS_IOERR; + } else { + struct printbuf buf = PRINTBUF; + + bch2_trans_do(c, + bch2_inum_offset_err_msg_trans(trans, &buf, + (subvol_inum) { subvol, read_pos.inode }, + read_pos.offset << 9)); + if (rbio->flags & BCH_READ_data_update) + prt_str(&buf, "(internal move) "); + prt_str(&buf, "successful retry"); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } bch2_rbio_done(rbio); From be31e412ac01f49bf7afa8eaa93dac399914a0a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Mar 2025 12:56:43 -0500 Subject: [PATCH 136/180] bcachefs: Checksum errors get additional retries It's possible for checksum errors to be transient - e.g. 
flakey controller or cable, thus we need additional retries (besides retrying from different replicas) before we can definitely return an error. This is particularly important for the next patch, which will allow the data move path to move extents with checksum errors - we don't want to accidentally introduce bitrot due to a transient error! - bch2_bkey_pick_read_device() is substantially reworked, and bch2_dev_io_failures is expanded to record more information about the type of failure (i.e. number of checksum errors). It now returns an error code that describes more precisely the reason for the failure - checksum error, io error, or offline device, instead of the previous generic "insufficient devices". This is important for the next patches that add poisoning, as we only want to poison extents when we've got real checksum errors (or perhaps IO errors?) - not because a device was offline. - Add a new option and superblock field for the number of checksum retries. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 + fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/errcode.h | 4 +- fs/bcachefs/extents.c | 165 ++++++++++++++++++++-------------- fs/bcachefs/extents.h | 7 +- fs/bcachefs/extents_types.h | 11 +-- fs/bcachefs/io_read.c | 10 ++- fs/bcachefs/opts.h | 5 ++ fs/bcachefs/super-io.c | 4 + 9 files changed, 128 insertions(+), 82 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7a5b0d211a82..e96d87767020 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +/* one free bit */ LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); 
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); +LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cd792fee7ee3..6abc9f17ea3c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1355,7 +1355,7 @@ static void btree_node_read_work(struct work_struct *work) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick); + bch2_mark_io_failure(&failed, &rb->pick, false); can_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index afa16d58041e..493cae4efc37 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -273,7 +273,6 @@ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ x(EIO, extent_poisened) \ - x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ @@ -283,6 +282,9 @@ x(EIO, ec_block_read) \ x(EIO, ec_block_write) \ x(EIO, data_read) \ + x(BCH_ERR_data_read, no_device_to_read_from) \ + x(BCH_ERR_data_read, data_read_io_err) \ + x(BCH_ERR_data_read, data_read_csum_err) \ x(BCH_ERR_data_read, data_read_retry) \ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 032cd0bda017..04946d9911f5 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -58,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures 
*f, } void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p) + struct extent_ptr_decoded *p, + bool csum_error) { struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); @@ -66,17 +67,16 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; - f->dev = p->ptr.dev; - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else if (p->idx != f->idx) { - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else { - f->nr_failed++; + memset(f, 0, sizeof(*f)); + f->dev = p->ptr.dev; } + + if (p->do_ec_reconstruct) + f->failed_ec = true; + else if (!csum_error) + f->failed_io = true; + else + f->failed_csum_nr++; } static inline u64 dev_latency(struct bch_dev *ca) @@ -94,37 +94,30 @@ static inline int dev_failed(struct bch_dev *ca) */ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, - const struct extent_ptr_decoded p2) + u64 p1_latency, + struct bch_dev *ca1, + const struct extent_ptr_decoded p2, + u64 p2_latency) { - if (likely(!p1.idx && !p2.idx)) { - struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev); - struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - int failed_delta = dev_failed(ca1) - dev_failed(ca2); + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + if (unlikely(failed_delta)) + return failed_delta < 0; - if (failed_delta) - return failed_delta < 0; + if (unlikely(bch2_force_reconstruct_read)) + return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - u64 l1 = dev_latency(ca1); - u64 l2 = dev_latency(ca2); + if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) + return p1.do_ec_reconstruct < p2.do_ec_reconstruct; - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ 
- l1 *= l1; - l2 *= l2; + int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; + if (unlikely(crc_retry_delta)) + return crc_retry_delta < 0; - /* Pick at random, biased in favor of the faster device: */ + /* Pick at random, biased in favor of the faster device: */ - return bch2_get_random_u64_below(l1 + l2) > l1; - } - - if (bch2_force_reconstruct_read) - return p1.idx > p2.idx; - - return p1.idx < p2.idx; + return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; } /* @@ -133,73 +126,109 @@ static inline bool ptr_better(struct bch_fs *c, * other devices, it will still pick a pointer from avoid. */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick, - int dev) + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick, + int dev) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_dev_io_failures *f; - int ret = 0; + bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; + bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) return -BCH_ERR_extent_poisened; rcu_read_lock(); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 pick_latency; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + have_dirty_ptrs |= !p.ptr.cached; + /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ if (p.ptr.unwritten) { - ret = 0; - break; + rcu_read_unlock(); + return 0; } /* Are we being asked to read from a specific device? 
*/ if (dev >= 0 && p.ptr.dev != dev) continue; - /* - * If there are any dirty pointers it's an error if we can't - * read: - */ - if (!ret && !p.ptr.cached) - ret = -BCH_ERR_no_device_to_read_from; - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (f) - p.idx = f->nr_failed < f->nr_retries - ? f->idx - : f->idx + 1; + struct bch_dev_io_failures *f = + unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; + if (unlikely(f)) { + p.crc_retry_nr = f->failed_csum_nr; + p.has_ec &= ~f->failed_ec; - if (!p.idx && (!ca || !bch2_dev_is_online(ca))) - p.idx++; + if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { + have_io_errors |= f->failed_io; + have_io_errors |= f->failed_ec; + } + have_csum_errors |= !!f->failed_csum_nr; - if (!p.idx && p.has_ec && bch2_force_reconstruct_read) - p.idx++; + if (p.has_ec && (f->failed_io || f->failed_csum_nr)) + p.do_ec_reconstruct = true; + else if (f->failed_io || + f->failed_csum_nr > c->opts.checksum_err_retry_nr) + continue; + } - if (p.idx > (unsigned) p.has_ec) - continue; + have_missing_devs |= ca && !bch2_dev_is_online(ca); - if (ret > 0 && !ptr_better(c, p, *pick)) - continue; + if (!ca || !bch2_dev_is_online(ca)) { + if (!p.has_ec) + continue; + p.do_ec_reconstruct = true; + } - *pick = p; - ret = 1; + if (bch2_force_reconstruct_read && p.has_ec) + p.do_ec_reconstruct = true; + + u64 p_latency = dev_latency(ca); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + p_latency *= p_latency; + + if (!have_pick || + ptr_better(c, + p, p_latency, ca, + *pick, pick_latency)) { + *pick = p; + pick_latency = p_latency; + have_pick = true; + } } rcu_read_unlock(); - return ret; + if (have_pick) + return 1; + if (!have_dirty_ptrs) + return 
0; + if (have_missing_devs) + return -BCH_ERR_no_device_to_read_from; + if (have_csum_errors) + return -BCH_ERR_data_read_csum_err; + if (have_io_errors) + return -BCH_ERR_data_read_io_err; + + WARN_ONCE(1, "unhandled error case in %s\n", __func__); + return -EINVAL; } /* KEY_TYPE_btree_ptr: */ diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index c50c4f353bab..e78a39e7e18f 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ({ \ __label__ out; \ \ - (_ptr).idx = 0; \ - (_ptr).has_ec = false; \ + (_ptr).has_ec = false; \ + (_ptr).do_ec_reconstruct = false; \ + (_ptr).crc_retry_nr = 0; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ @@ -401,7 +402,7 @@ out: \ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, bool); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *, int); diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 43d6c341ecca..e51529dca4c2 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { - unsigned idx; bool has_ec; + bool do_ec_reconstruct; + u8 crc_retry_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; struct bch_extent_stripe_ptr ec; @@ -31,10 +32,10 @@ struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; - u8 idx; - u8 nr_failed; - u8 nr_retries; - } devs[BCH_REPLICAS_MAX]; + unsigned failed_csum_nr:6, + failed_io:1, + failed_ec:1; + } devs[BCH_REPLICAS_MAX + 1]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 887e3c9ac181..aecc645e6516 100644 --- 
a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -487,7 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work) bvec_iter_sectors(rbio->bvec_iter)); if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(&failed, &rbio->pick); + bch2_mark_io_failure(&failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_retry_csum_err); if (!rbio->split) { rbio->bio.bi_status = 0; @@ -991,7 +992,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); + bch2_mark_io_failure(failed, &pick, false); percpu_ref_put(&ca->io_ref); goto retry_pick; } @@ -1154,7 +1155,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, else bch2_trans_unlock_long(trans); - if (!rbio->pick.idx) { + if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); @@ -1215,7 +1216,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio = bch2_rbio_free(rbio); if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(failed, &pick); + bch2_mark_io_failure(failed, &pick, + ret == -BCH_ERR_data_read_retry_csum_err); return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index afb89d318d24..baa9c11acb1a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -186,6 +186,11 @@ enum fsck_err_opts { OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ + x(checksum_err_retry_nr, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, 32), \ + BCH_SB_CSUM_ERR_RETRY_NR, 3, \ + NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 
ee32d043414a..f87e3bf33ec0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -457,6 +457,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && + !BCH_SB_CSUM_ERR_RETRY_NR(sb)) + SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); } #ifdef __KERNEL__ From 3fb8bacb14b6fb7a0040177bb7766f5c7bd68913 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Mar 2025 16:56:09 -0400 Subject: [PATCH 137/180] bcachefs: BCH_READ_data_update -> bch_read_bio.data_update Read flags are codepath dependent and change as they're passed around, while the fields in rbio._state are mostly fixed properties of that particular object. Losing track of BCH_READ_data_update would be bad, and previously it was not obvious if it was always correctly set in the rbio, so this is a safety cleanup. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 1 + fs/bcachefs/io_read.c | 57 ++++++++++++++++++++++----------------- fs/bcachefs/io_read.h | 20 +++++++------- fs/bcachefs/move.c | 1 - 4 files changed, 44 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 522574bc4197..44b8ed3cc5d6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -700,6 +700,7 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, } rbio_init(&m->rbio.bio, c, *io_opts, NULL); + m->rbio.data_update = true; m->rbio.bio.bi_iter.bi_size = buf_bytes; m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index aecc645e6516..a4a61bce076a 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -103,14 +103,21 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } -static bool 
ptr_being_rewritten(struct bch_read_bio *orig, - unsigned dev, - unsigned flags) +static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) { - if (!(flags & BCH_READ_data_update)) + EBUG_ON(rbio->split); + + return rbio->data_update + ? container_of(rbio, struct data_update, rbio) + : NULL; +} + +static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) +{ + struct data_update *u = rbio_data_update(orig); + if (!u) return false; - struct data_update *u = container_of(orig, struct data_update, rbio); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -199,7 +206,6 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, struct bpos pos, struct extent_ptr_decoded *pick, unsigned sectors, - unsigned flags, struct bch_read_bio *orig, struct bch_io_failures *failed) { @@ -220,7 +226,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (bch2_dev_io_failures(failed, ptr->dev) && - !ptr_being_rewritten(orig, ptr->dev, flags)) + !ptr_being_rewritten(orig, ptr->dev)) update_opts.rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } @@ -314,7 +320,7 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, sectors, flags, orig, failed); + k, pos, pick, sectors, orig, failed); if (!promote) return NULL; @@ -342,7 +348,7 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o if (ret) return ret; - if (rbio->flags & BCH_READ_data_update) + if (rbio->data_update) prt_str(out, "(internal move) "); return 0; @@ -505,7 +511,7 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_last_fragment; flags |= BCH_READ_must_clone; - int ret = flags & BCH_READ_data_update + int ret = rbio->data_update ? 
bch2_read_retry_nodecode(c, rbio, iter, &failed, flags) : __bch2_read(c, rbio, iter, inum, &failed, flags); @@ -519,7 +525,7 @@ static void bch2_rbio_retry(struct work_struct *work) bch2_inum_offset_err_msg_trans(trans, &buf, (subvol_inum) { subvol, read_pos.inode }, read_pos.offset << 9)); - if (rbio->flags & BCH_READ_data_update) + if (rbio->data_update) prt_str(&buf, "(internal move) "); prt_str(&buf, "successful retry"); @@ -712,9 +718,10 @@ static void __bch2_read_endio(struct work_struct *work) container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_read_bio *parent = bch2_rbio_parent(rbio); + struct bio *src = &rbio->bio; + struct bio *dst = &parent->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; @@ -764,7 +771,7 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (likely(!(rbio->flags & BCH_READ_data_update))) { + if (likely(!parent->data_update)) { /* Adjust crc to point to subset of data we want: */ crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); @@ -934,6 +941,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_read_bio *rbio = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); + struct data_update *u = rbio_data_update(orig); int ret = 0; if (bkey_extent_is_inline_data(k.k)) { @@ -997,7 +1005,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto retry_pick; } - if (!(flags & BCH_READ_data_update)) { + if (likely(!u)) { if 
(!(flags & BCH_READ_last_fragment) || bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_must_clone; @@ -1020,12 +1028,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bounce = true; } } else { - read_full = true; /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - struct data_update *u = container_of(orig, struct data_update, rbio); if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); @@ -1034,6 +1040,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } iter.bi_size = pick.crc.compressed_size << 9; + read_full = true; } if (orig->opts.promote_target || have_io_error(failed)) @@ -1127,7 +1134,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); - if (!(flags & BCH_READ_data_update)) + if (!u) this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); else this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); @@ -1137,7 +1144,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) + if (ca && pick.ptr.cached && !u) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -1234,11 +1241,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], bvec_iter_sectors(iter)); /* - * won't normally happen in the BCH_READ_data_update - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: + * won't normally happen in the data update (bch2_move_extent()) path, + * but if we retry and the extent we wanted to read no longer exists we + * 
have to signal that: */ - if (flags & BCH_READ_data_update) + if (u) orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); @@ -1259,7 +1266,7 @@ int __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bkey_s_c k; int ret; - BUG_ON(flags & BCH_READ_data_update); + EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 42a22985d789..559d8986d84c 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -35,7 +35,8 @@ struct bch_read_bio { u16 flags; union { struct { - u16 promote:1, + u16 data_update:1, + promote:1, bounce:1, split:1, have_ioref:1, @@ -108,7 +109,6 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, x(retry_if_stale) \ x(may_promote) \ x(user_mapped) \ - x(data_update) \ x(last_fragment) \ x(must_bounce) \ x(must_clone) \ @@ -161,12 +161,13 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, { struct bch_read_bio *rbio = to_rbio(bio); - rbio->c = orig->c; - rbio->_state = 0; - rbio->ret = 0; - rbio->split = true; - rbio->parent = orig; - rbio->opts = orig->opts; + rbio->c = orig->c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; return rbio; } @@ -180,7 +181,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->start_time = local_clock(); rbio->c = c; rbio->_state = 0; - rbio->ret = 0; + rbio->flags = 0; + rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; return rbio; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 307b918fa2e7..10843a2ebb88 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -359,7 +359,6 @@ int bch2_move_extent(struct moving_context *ctxt, bkey_start_pos(k.k), iter->btree_id, k, 0, NULL, - BCH_READ_data_update| BCH_READ_last_fragment, data_opts.scrub ? 
data_opts.read_dev : -1); return 0; From 35de2abc22274689e78a6582c6b4439db30673ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 13:33:41 -0400 Subject: [PATCH 138/180] bcachefs: __bch2_read() now takes a btree_trans Next patch will be checking if the extent we're reading from matches the IO failure we saw before marking the failure. For this to work, __bch2_read() needs to take the same transaction context that bch2_rbio_retry() uses to do that check. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 30 ++++++++++++++---------------- fs/bcachefs/io_read.h | 12 +++++++----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index a4a61bce076a..ce197b5bda9d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -428,13 +428,13 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static noinline int bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) +static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) { struct data_update *u = container_of(rbio, struct data_update, rbio); - struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); @@ -470,8 +470,6 @@ static noinline int bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_b } BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - bch2_trans_put(trans); - return ret; } @@ -487,6 +485,7 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; + struct btree_trans *trans = bch2_trans_get(c); trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], @@ -512,8 +511,8 @@ static void bch2_rbio_retry(struct work_struct *work) flags |= 
BCH_READ_must_clone; int ret = rbio->data_update - ? bch2_read_retry_nodecode(c, rbio, iter, &failed, flags) - : __bch2_read(c, rbio, iter, inum, &failed, flags); + ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) + : __bch2_read(trans, rbio, iter, inum, &failed, flags); if (ret) { rbio->ret = ret; @@ -521,7 +520,7 @@ static void bch2_rbio_retry(struct work_struct *work) } else { struct printbuf buf = PRINTBUF; - bch2_trans_do(c, + lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, (subvol_inum) { subvol, read_pos.inode }, read_pos.offset << 9)); @@ -534,6 +533,7 @@ static void bch2_rbio_retry(struct work_struct *work) } bch2_rbio_done(rbio); + bch2_trans_put(trans); } static void bch2_rbio_error(struct bch_read_bio *rbio, @@ -1256,11 +1256,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, return 0; } -int __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) +int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; @@ -1357,9 +1357,7 @@ int __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, bch2_rbio_done(rbio); } - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); - return ret; } diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 559d8986d84c..cd21950417f6 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "btree_iter.h" #include "reflink.h" struct bch_read_bio { @@ -140,7 +141,7 @@ static inline void bch2_read_extent(struct btree_trans *trans, data_btree, k, offset_into_extent, NULL, flags, -1); } -int __bch2_read(struct bch_fs *, struct 
bch_read_bio *, struct bvec_iter, +int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, @@ -150,10 +151,11 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, NULL, - BCH_READ_retry_if_stale| - BCH_READ_may_promote| - BCH_READ_user_mapped); + bch2_trans_run(c, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped)); } static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, From 76bc6e51cd915a72fea346e5fd2b6d26d5dd8021 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sat, 15 Mar 2025 15:39:42 +0800 Subject: [PATCH 139/180] bcachefs: Increase blacklist range Now there are 16 journal buffers, 8 is too small to be enough. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 71c786cdb192..a6e26733854d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -899,7 +899,7 @@ int bch2_fs_recovery(struct bch_fs *c) * journal sequence numbers: */ if (!c->sb.clean) - journal_seq += 8; + journal_seq += JOURNAL_BUF_NR * 4; if (blacklist_seq != journal_seq) { ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", From fb8a9a32ccd2979b4ec77fde01cc585ff2835e55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Mar 2025 19:24:44 -0400 Subject: [PATCH 140/180] bcachefs: trace_io_move_write_fail Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 14 ++++++++++---- fs/bcachefs/move.c | 20 ++++++++++++++++---- fs/bcachefs/sb-counters_format.h | 1 + fs/bcachefs/trace.h | 5 +++++ 4 files changed, 32 insertions(+), 8 deletions(-) diff --git 
a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 48befbae0226..a2e6b30530e3 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1742,20 +1742,26 @@ static const char * const bch2_write_flags[] = { void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) { - prt_str(out, "pos: "); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "started: "); + prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); prt_newline(out); - prt_str(out, "flags: "); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); + prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + + prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 10843a2ebb88..2d9ce7fb5818 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -101,13 +101,25 @@ static void move_free(struct moving_io *io) static void move_write_done(struct bch_write_op *op) { struct moving_io *io = container_of(op, struct moving_io, write.op); + struct bch_fs *c = op->c; struct moving_context *ctxt = io->write.ctxt; - if (io->write.op.error) - ctxt->write_error = true; + if (op->error) { + if (trace_io_move_write_fail_enabled()) { + struct printbuf buf = PRINTBUF; - atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_dec(&io->write.ctxt->write_ios); + bch2_write_op_to_text(&buf, op); + prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); + trace_io_move_write_fail(c, buf.buf); + printbuf_exit(&buf); + } + this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); + + ctxt->write_error = true; + } + + 
atomic_sub(io->write_sectors, &ctxt->write_sectors); + atomic_dec(&ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index c82a891026d3..fa27ec59a647 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -22,6 +22,7 @@ enum counters_flags { x(io_move_write, 36, TYPE_SECTORS) \ x(io_move_finish, 37, TYPE_SECTORS) \ x(io_move_fail, 38, TYPE_COUNTER) \ + x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c8669a6b9cec..519d00d62ae7 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -846,6 +846,11 @@ DEFINE_EVENT(fs_str, io_move_fail, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, io_move_write_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + DEFINE_EVENT(fs_str, io_move_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) From 9962cb77488f617963d0314e8c9120315d97ea18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Mar 2025 21:32:33 -0400 Subject: [PATCH 141/180] bcachefs: Improve can_write_extent() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes another "rebalance spinning and doing no work" issue; rebalance was reading extents it wanted to move, but then failing in bch2_write() -> bch2_alloc_sectors_start() due to being unable to allocate sufficient replicas. This was triggered by a user playing with the durability settings, the foreground device was an NVME device with durability=2, and originally he'd set the background device to durability=2 as well, but changed it back to 1 (the default) after seeing IO errors. 
That meant that with replicas=2, we want to move data off the NVME device which satisfies that constraint, but with a single durability=1 device on the background target there's no way to move the extent to that target while satisfying the "required replicas" constraint. The solution for now is for bch2_data_update_init() to check for this, and return an error - before kicking off the read. bch2_data_update_init() already had two different checks for "will we be able to write this extent", with partially duplicated code, so this patch combines and improves that logic. Additionally, we now always bail out and return an error if there's insufficient space on the destination target. Previously, we only did this for BCH_WRITE_alloc_nowait moves, because it might be the case that copygc just needs to free up space on the destination target. But we really shouldn't kick off a move if the destination is full, we can't currently distinguish between "really full" and "just need to wait for copygc", and if we are going to wait on copygc it'd be better to do that before kicking off the move. This will additionally fix "rebalance spinning" issues caused by a filesystem that has more data than can fit in background_target - which is a valid scenario, since we don't exclude foreground/cache devices when calculating filesystem capacity. 
Reported-by: Maël Kerbiriou Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 103 +++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 57 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 44b8ed3cc5d6..08bb7f3019ce 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -638,40 +638,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -static bool can_allocate_without_blocking(struct bch_fs *c, - struct data_update *m) -{ - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return false; - - unsigned target = m->op.flags & BCH_WRITE_only_specified_devs - ? m->op.target - : 0; - struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); - - darray_for_each(m->op.devs_have, i) - __clear_bit(*i, devs.d); - - rcu_read_lock(); - unsigned nr_replicas = 0, i; - for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu(c, i); - - struct bch_dev_usage usage; - bch2_dev_usage_read_fast(ca, &usage); - - if (!dev_buckets_free(ca, usage, m->op.watermark)) - continue; - - nr_replicas += ca->mi.durability; - if (nr_replicas >= m->op.nr_replicas) - break; - } - rcu_read_unlock(); - - return nr_replicas >= m->op.nr_replicas; -} - int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, struct bch_io_opts *io_opts) { @@ -707,16 +673,42 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, return 0; } -static bool can_write_extent(struct bch_fs *c, - struct bch_devs_list *devs_have, - unsigned target) +static int can_write_extent(struct bch_fs *c, struct data_update *m) { + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) + return -BCH_ERR_data_update_done_would_block; + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs + ? 
m->op.target + : 0; struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); - darray_for_each(*devs_have, i) + darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); - return !bch2_is_zero(&devs, sizeof(devs)); + rcu_read_lock(); + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu(c, i); + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + + if (!dev_buckets_free(ca, usage, m->op.watermark)) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } + rcu_read_unlock(); + + if (!nr_replicas) + return -BCH_ERR_data_update_done_no_rw_devs; + if (nr_replicas < m->op.nr_replicas) + return -BCH_ERR_insufficient_devices; + return 0; } int bch2_data_update_init(struct btree_trans *trans, @@ -800,20 +792,6 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } - if (!can_write_extent(c, &m->op.devs_have, - m->op.flags & BCH_WRITE_only_specified_devs ? 
m->op.target : 0)) { - /* - * Check if we have rw devices not in devs_have: this can happen - * if we're trying to move data on a ro or failed device - * - * If we can't move it, we need to clear the rebalance_work bit, - * if applicable - * - * Also, copygc should skip ro/failed devices: - */ - return -BCH_ERR_data_update_done_no_rw_devs; - } - unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* @@ -853,11 +831,22 @@ int bch2_data_update_init(struct btree_trans *trans, goto out_bkey_buf_exit; } - if ((m->op.flags & BCH_WRITE_alloc_nowait) && - !can_allocate_without_blocking(c, m)) { - ret = -BCH_ERR_data_update_done_would_block; + /* + * Check if the allocation will succeed, to avoid getting an error later + * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless + * read: + * + * This guards against + * - BCH_WRITE_alloc_nowait allocations failing (promotes) + * - Destination target full + * - Device(s) in destination target offline + * - Insufficient durability available in destination target + * (i.e. trying to move a durability=2 replica to a target with a + * single durability=2 device) + */ + ret = can_write_extent(c, m); + if (ret) goto out_bkey_buf_exit; - } if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, From 9b39835e932e94a3cef3d02c08a5b1df585c74bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 16 Mar 2025 13:39:14 -0400 Subject: [PATCH 142/180] bcachefs: #if 0 out (enable|disable)_encryption() These weren't hooked up, but they probably should be - add some comments for context. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 13 +++++++++++++ fs/bcachefs/checksum.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 23a383577d4c..15de9d794337 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c) return 0; } +#if 0 + +/* + * This seems to be duplicating code in cmd_remove_passphrase() in + * bcachefs-tools, but we might want to switch userspace to use this - and + * perhaps add an ioctl for calling this at runtime, so we can take the + * passphrase off of a mounted filesystem (which has come up). + */ int bch2_disable_encryption(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; @@ -725,6 +733,10 @@ int bch2_disable_encryption(struct bch_fs *c) return ret; } +/* + * For enabling encryption on an existing filesystem: not hooked up yet, but it + * should be + */ int bch2_enable_encryption(struct bch_fs *c, bool keyed) { struct bch_encrypted_key key; @@ -781,6 +793,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) memzero_explicit(&key, sizeof(key)); return ret; } +#endif void bch2_fs_encryption_exit(struct bch_fs *c) { diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 43b9d71f2f2b..4ac251c8fcd8 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); +#if 0 int bch2_disable_encryption(struct bch_fs *); int bch2_enable_encryption(struct bch_fs *, bool); +#endif void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); From 39abc73b595587180ce4d57c4ca56a52ca796fc2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Mar 2025 20:03:19 -0700 Subject: [PATCH 143/180] bcachefs: Remove unnecessary softdeps on crc32c and crc64 Since bcachefs does not access 
crc32c and crc64 through the crypto API, there is no need to use module softdeps to ensure they are loaded. Signed-off-by: Eric Biggers Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 78a8daa80fcc..6de3a5751561 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -75,8 +75,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: crc32c"); -MODULE_SOFTDEP("pre: crc64"); MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); From 71fbb0b86e719cb84524c7f8904c0c2e5cdc2697 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Mar 2025 20:47:17 -0700 Subject: [PATCH 144/180] bcachefs: use sha256() instead of crypto_shash API Just use sha256() instead of the clunky crypto API. This is much simpler. Signed-off-by: Eric Biggers Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 2 +- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/checksum.c | 10 ---------- fs/bcachefs/str_hash.h | 8 ++------ fs/bcachefs/super.c | 1 - 5 files changed, 3 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index fc7efd0a7525..c9798750202d 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -16,7 +16,7 @@ config BCACHEFS_FS select ZSTD_COMPRESS select ZSTD_DECOMPRESS select CRYPTO - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select CRYPTO_CHACHA20 select CRYPTO_POLY1305 select KEYS diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b432bb6e6f6e..0ea593e813f4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -979,7 +979,6 @@ struct bch_fs { mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; - struct crypto_shash *sha256; struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 
15de9d794337..7f9e4c59950c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -801,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c) crypto_free_shash(c->poly1305); if (c->chacha20) crypto_free_sync_skcipher(c->chacha20); - if (c->sha256) - crypto_free_shash(c->sha256); } int bch2_fs_encryption_init(struct bch_fs *c) @@ -811,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c) struct bch_key key; int ret = 0; - c->sha256 = crypto_alloc_shash("sha256", 0, 0); - ret = PTR_ERR_OR_ZERO(c->sha256); - if (ret) { - c->sha256 = NULL; - bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); - goto out; - } - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index f645a4547b04..575ad1e03904 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -12,7 +12,6 @@ #include "super.h" #include -#include #include static inline enum bch_str_hash_type @@ -55,13 +54,10 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; - desc->tfm = c->sha256; - - crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); + sha256((const u8 *)&bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6de3a5751561..8e928b3d8ef4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -75,7 +75,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); From 6aa446c05a44ddc46b0aea510a95d288e993daec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Mar 2025 17:27:27 -0400 
Subject: [PATCH 145/180] bcachefs: Fix offset_into_extent in data move path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following: [ 17.607394] kernel BUG at fs/bcachefs/reflink.c:261! [ 17.608316] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 17.608485] CPU: 0 UID: 0 PID: 564 Comm: bch-rebalance/3 Tainted: G OE 6.14.0-rc6-arch1-gfcb0bd9609d2 #7 0efd7a8f4a00afeb2c5fb6e7ecb1aec8ddcbb1e1 [ 17.608616] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 17.608736] Hardware name: Micro-Star International Co., Ltd. MS-7D75/MAG B650 TOMAHAWK WIFI (MS-7D75), BIOS 1.74 08/01/2023 [ 17.608855] RIP: 0010:bch2_lookup_indirect_extent+0x252/0x290 [bcachefs] [ 17.609006] Code: 00 00 00 00 e8 7f 51 f5 ff 89 c3 85 c0 74 52 48 8b 7d b0 4c 89 ee e8 4d 4b f4 ff 48 63 d3 48 89 d0 31 d2 e9 2e ff ff ff 0f 0b <0f> 0b 48 8b 7d b0 4c 89 ee 48 89 55 a8 e8 2c 4b f4 ff 4c 8b 55 a8 [ 17.609136] RSP: 0018:ffffa3714455f850 EFLAGS: 00010246 [ 17.609261] RAX: 0000000000000080 RBX: ffff895891098790 RCX: 0000000000000000 [ 17.609387] RDX: 0000000000000080 RSI: ffffa3714455fa90 RDI: ffff895889550000 [ 17.609511] RBP: ffffa3714455f8c0 R08: ffff895891098790 R09: 0000000000000001 [ 17.609637] R10: ffffa3714455f8d8 R11: ffffa3714455f950 R12: ffffa3714455fa58 [ 17.609763] R13: ffff895891098790 R14: ffffa3714455fa58 R15: ffff895889550000 [ 17.609888] FS: 0000000000000000(0000) GS:ffff896757c00000(0000) knlGS:0000000000000000 [ 17.610015] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.610143] CR2: 0000716b8cda2750 CR3: 0000000914e22000 CR4: 0000000000f50ef0 [ 17.610272] PKRU: 55555554 [ 17.610403] Call Trace: [ 17.610535] [ 17.610662] ? __die_body.cold+0x19/0x27 [ 17.610791] ? die+0x2e/0x50 [ 17.610918] ? do_trap+0xca/0x110 [ 17.611049] ? do_error_trap+0x6a/0x90 [ 17.611178] ? bch2_lookup_indirect_extent+0x252/0x290 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.611331] ? exc_invalid_op+0x50/0x70 [ 17.611468] ? 
bch2_lookup_indirect_extent+0x252/0x290 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.611620] ? asm_exc_invalid_op+0x1a/0x20 [ 17.611757] ? bch2_lookup_indirect_extent+0x252/0x290 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.611911] ? bch2_move_data_btree+0x58a/0x6c0 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.612084] bch2_move_data_btree+0x58a/0x6c0 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.612256] ? __pfx_rebalance_pred+0x10/0x10 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.612431] ? bch2_move_extent+0x3d7/0x6e0 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.612607] ? __bch2_move_data+0xea/0x200 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.612782] __bch2_move_data+0xea/0x200 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.612959] ? __pfx_rebalance_pred+0x10/0x10 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.613149] do_rebalance+0x517/0x8d0 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.613342] ? local_clock_noinstr+0xd/0xd0 [ 17.613518] ? local_clock+0x15/0x30 [ 17.613693] ? __bch2_trans_get+0x152/0x300 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.613890] ? __pfx_bch2_rebalance_thread+0x10/0x10 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] [ 17.614090] bch2_rebalance_thread+0x66/0xb0 [bcachefs c42b95c23facdfe11d39755520127cd771dddec2] The offset_into_extent bit was copied from the read path, but it's unnecessary here, where we always want to read and move the entire indirect extent, and it causes the assertion pop - because we're using a non-extents iterator, which always points to the end of the reflink pointer. 
Reported-by: Maël Kerbiriou Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2d9ce7fb5818..55e17c2d8e5a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -591,7 +591,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + s64 offset_into_extent = 0; bch2_trans_iter_exit(trans, &reflink_iter); k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); @@ -610,6 +610,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, * pointer - need to fixup iter->k */ extent_iter = &reflink_iter; + offset_into_extent = 0; } if (!bkey_extent_is_direct_data(k.k)) From 2eb985c54954f1e4203f0d72cb0a6fe15d0958b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 15 Mar 2025 19:57:20 -0400 Subject: [PATCH 146/180] bcachefs: Better incompat version/feature error messages If we can't mount because of an incompatibility, print what's supported and unsupported - to help solve PEBKAC issues. 
Reported-by: Roland Vet Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f87e3bf33ec0..7dcabc9676f4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -377,15 +377,27 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (ret) return ret; - if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { - prt_printf(out, "Filesystem has incompatible features"); + u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); + unsigned incompat_bit = 0; + if (incompat) + incompat_bit = __ffs64(incompat); + else if (sb->features[1]) + incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); + + if (incompat_bit) { + prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", + incompat_bit, + bch2_sb_features[BCH_FEATURE_NR - 1], + BCH_FEATURE_NR - 1); return -BCH_ERR_invalid_sb_features; } if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { - prt_printf(out, "Filesystem has incompatible version"); + prt_str(out, "Filesystem has incompatible version "); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_str(out, ", current version "); + bch2_version_to_text(out, bcachefs_metadata_version_current); return -BCH_ERR_invalid_sb_features; } From 5e67243ea670c61d6e59eaf358b74991e45c7b16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2025 11:28:26 -0400 Subject: [PATCH 147/180] bcachefs: Add missing random.h includes Fix build in userspace, and good hygiene. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index ce197b5bda9d..f1503df57dc7 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -25,6 +25,7 @@ #include "subvolume.h" #include "trace.h" +#include #include #ifdef CONFIG_BCACHEFS_DEBUG From 8bd875ae47cf9138816e97578cda017374b715b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2025 10:54:21 -0400 Subject: [PATCH 148/180] bcachefs: bch2_sb_validate() doesn't need bch_sb_handle Minor refactoring, so that bch2_sb_validate() can be used in the new userspace superblock recovery tool. Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 8 +++----- fs/bcachefs/super-io.h | 2 ++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7dcabc9676f4..2e86a04b49e0 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -365,10 +365,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - enum bch_validate_flags flags, struct printbuf *out) +int bch2_sb_validate(struct bch_sb *sb, enum bch_validate_flags flags, struct printbuf *out) { - struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; int ret; @@ -890,7 +888,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, sb->have_layout = true; - ret = bch2_sb_validate(sb, 0, &err); + ret = bch2_sb_validate(sb->sb, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -1047,7 +1045,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); + ret = bch2_sb_validate((*ca)->disk_sb.sb, BCH_VALIDATE_write, &err); if (ret) { 
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 167dd98f893e..41562380a353 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); +int bch2_sb_validate(struct bch_sb *, enum bch_validate_flags, struct printbuf *); + int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); From 92c7789a9ed892da60cb5da2bcb6278551e2eb34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2025 13:58:51 -0400 Subject: [PATCH 149/180] bcachefs: Validate bch_sb.offset field This was missed - but it needs to be correct for the superblock recovery tool that scans the start and end of the device for backup superblocks: we don't want to pick up superblocks that belong to a different partition that starts at a different offset. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/super-io.c | 14 +++++++++++--- fs/bcachefs/super-io.h | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 493cae4efc37..cb27de6ffad6 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -231,6 +231,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_csum) \ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_offset) \ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2e86a04b49e0..f2e4428281a3 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -365,7 +365,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -int bch2_sb_validate(struct bch_sb *sb, enum bch_validate_flags flags, struct printbuf *out) +int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + enum bch_validate_flags flags, struct printbuf *out) { struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; @@ -409,6 +410,13 @@ int bch2_sb_validate(struct bch_sb *sb, enum bch_validate_flags flags, struct pr return -BCH_ERR_invalid_sb_uuid; } + if (!(flags & BCH_VALIDATE_write) && + le64_to_cpu(sb->offset) != read_offset) { + prt_printf(out, "Bad sb offset (got %llu, read from %llu)", + le64_to_cpu(sb->offset), read_offset); + return -BCH_ERR_invalid_sb_offset; + } + if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { prt_printf(out, "Bad number of member devices %u (max %u)", @@ -888,7 +896,7 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, sb->have_layout = true; - ret = bch2_sb_validate(sb->sb, 0, &err); + ret = bch2_sb_validate(sb->sb, offset, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: 
%s\n", path, err.buf); @@ -1045,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate((*ca)->disk_sb.sb, BCH_VALIDATE_write, &err); + ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 41562380a353..78f708a6fbcd 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -92,7 +92,7 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -int bch2_sb_validate(struct bch_sb *, enum bch_validate_flags, struct printbuf *); +int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); From 16a8d5d00b8add42924185c30f7cefdbb156fdd4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Mar 2025 15:07:06 -0400 Subject: [PATCH 150/180] bcachefs: Fix btree iter flags in data move MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rebalance requires a not_extents iterator. This wasn't hit before because all_snapshots disables is_extents on snapshots btrees - but has no effect on the reflink btree. 
Reported-by: Maël Kerbiriou Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 55e17c2d8e5a..8fcdc6984f6e 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -561,6 +561,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_prefetch| + BTREE_ITER_not_extents| BTREE_ITER_all_snapshots); if (ctxt->rate) From 5cc0ab39fb16c44ef6bbabb1b5a0c5705ec0bb56 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 18 Mar 2025 01:54:24 +0800 Subject: [PATCH 151/180] bcachefs: Fix incorrect state count atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR is the condition in journal_entry_open where we return JOURNAL_ERR_max_open, so journal_cur_seq(j) - seq == JOURNAL_STATE_BUF_NR means that the buf corresponding to seq has started to write. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1c460ded2a11..ab68c5c4d8d8 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -161,7 +161,7 @@ static inline int journal_state_count(union journal_res_state s, int idx) static inline int journal_state_seq_count(struct journal *j, union journal_res_state s, u64 seq) { - if (journal_cur_seq(j) - seq <= JOURNAL_STATE_BUF_NR) + if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); else return 0; From dd7ae389ff84acb4f332f3fa614d15a3e1f2087f Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 18 Mar 2025 15:50:01 +0800 Subject: [PATCH 152/180] bcachefs: Remove spurious smp_mb() The smp_mb() is paired with nothing. 
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8d4f3bfaa228..fc15644c9f24 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -761,7 +761,6 @@ void bch2_journal_entry_res_resize(struct journal *j, goto out; j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); - smp_mb(); state = READ_ONCE(j->reservations); if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && From d2bad59255dcc1959c8f931fcf85775c53e06d9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 12:05:50 -0400 Subject: [PATCH 153/180] bcachefs: Kill BCH_DEV_OPT_SETTERS() Previously, device options had their superblock option field listed separately, which was weird and easy to miss when defining options. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 108 +++++++++++++++----------------- fs/bcachefs/opts.h | 50 +++++++-------- fs/bcachefs/sb-members_format.h | 1 + fs/bcachefs/super-io.c | 8 +-- 4 files changed, 80 insertions(+), 87 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 6772faf385a5..ae47345f93c1 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -163,16 +163,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = { [DT_SUBVOL] = "subvol", }; -u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) -{ - BUG(); -} - -void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) -{ - BUG(); -} - void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) 
\ @@ -223,6 +213,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) } } +/* dummy option, for options that aren't stored in the superblock */ +typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); +typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); +typedef u64 (*member_opt_get_fn)(const struct bch_member *); +typedef void (*member_opt_set_fn)(struct bch_member *, u64); + +__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; +__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; +__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; +__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; + +#define type_compatible_or_null(_p, _type) \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) + const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ @@ -239,15 +244,15 @@ const struct bch_option bch2_opt_table[] = { #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ - .attr = { \ - .name = #_name, \ - .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ - }, \ - .flags = _flags, \ - .hint = _hint, \ - .help = _help, \ - .get_sb = _sb_opt, \ - .set_sb = SET_##_sb_opt, \ + .attr.name = #_name, \ + .attr.mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ + .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ + .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ + .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ + .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ + .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ _type \ }, @@ -495,12 +500,8 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) int bch2_opts_check_may_set(struct bch_fs *c) { - unsigned i; - int ret; - - for (i = 0; i < bch2_opts_nr; i++) { - ret = bch2_opt_check_may_set(c, i, - bch2_opt_get_by_id(&c->opts, i)); + for (unsigned i = 0; i < bch2_opts_nr; i++) { + int ret = bch2_opt_check_may_set(c, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -619,12 +620,25 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, return ret; } -u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) { const struct bch_option *opt = bch2_opt_table + id; u64 v; - v = opt->get_sb(sb); + if (dev_idx < 0) { + v = opt->get_sb(sb); + } else { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) + return 0; + + struct bch_member m = bch2_sb_member_get(sb, dev_idx); + v = opt->get_member(&m); + } + + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + --v; if (opt->flags & OPT_SB_FIELD_ILOG2) v = 1ULL << v; @@ -641,35 +655,19 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) */ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { - unsigned id; - - for (id = 0; id < bch2_opts_nr; id++) { + for (unsigned id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb == BCH2_NO_SB_OPT) - continue; - - bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); + if (opt->get_sb) + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, 
id, -1)); } return 0; } -struct bch_dev_sb_opt_set { - void (*set_sb)(struct bch_member *, u64); -}; - -static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { -#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, - BCH_DEV_OPT_SETTERS() -#undef x -}; - void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 v) { - enum bch_opt_id id = opt - bch2_opt_table; - if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -679,24 +677,18 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; - if (opt->flags & OPT_FS) { - if (opt->set_sb != SET_BCH2_NO_SB_OPT) - opt->set_sb(sb, v); - } + if ((opt->flags & OPT_FS) && opt->set_sb) + opt->set_sb(sb, v); - if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { + if ((opt->flags & OPT_DEVICE) && + opt->set_member && + dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) return; - struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); - - const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; - if (set->set_sb) - set->set_sb(m, v); - else - pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); + opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v); } } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index baa9c11acb1a..c0adfd5b4f1c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -50,10 +50,6 @@ static inline const char *bch2_d_type_str(unsigned d_type) * apply the options from that struct that are defined. 
*/ -/* dummy option, for options that aren't stored in the superblock */ -u64 BCH2_NO_SB_OPT(const struct bch_sb *); -void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); - /* When can be set: */ enum opt_flags { OPT_FS = BIT(0), /* Filesystem option */ @@ -318,11 +314,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't kick drives out when splitbrain detected")\ - x(discard, u8, \ - OPT_FS|OPT_MOUNT|OPT_DEVICE, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -503,27 +494,37 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, false, \ NULL, "Skip submit_bio() for data reads and writes, " \ "for performance testing purposes") \ + x(state, u64, \ + OPT_DEVICE, \ + OPT_STR(bch2_member_states), \ + BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ + "state", "rw,ro,failed,spare") \ x(fs_size, u64, \ - OPT_DEVICE, \ + OPT_DEVICE|OPT_HIDDEN, \ OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ + BCH2_NO_MEMBER_OPT, 0, \ "size", "Size of filesystem on device") \ - x(bucket, u32, \ - OPT_DEVICE, \ + x(bucket_size, u32, \ + OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ + BCH_MEMBER_BUCKET_SIZE, 0, \ "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ - OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ + OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ - BCH2_NO_SB_OPT, 1, \ + BCH_MEMBER_DURABILITY, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ x(data_allowed, u8, \ OPT_DEVICE, \ OPT_BITFIELD(__bch2_data_types), \ - BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ + x(discard, u8, \ + 
OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_MEMBER_DISCARD, true, \ + NULL, "Enable discard/TRIM support") \ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -531,11 +532,6 @@ enum fsck_err_opts { NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") -#define BCH_DEV_OPT_SETTERS() \ - x(discard, BCH_MEMBER_DISCARD) \ - x(durability, BCH_MEMBER_DURABILITY) \ - x(data_allowed, BCH_MEMBER_DATA_ALLOWED) - struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() @@ -592,8 +588,6 @@ struct printbuf; struct bch_option { struct attribute attr; - u64 (*get_sb)(const struct bch_sb *); - void (*set_sb)(struct bch_sb *, u64); enum opt_type type; enum opt_flags flags; u64 min, max; @@ -605,6 +599,12 @@ struct bch_option { const char *hint; const char *help; + u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); + + u64 (*get_member)(const struct bch_member *); + void (*set_member)(struct bch_member *, u64); + }; extern const struct bch_option bch2_opt_table[]; @@ -613,7 +613,7 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 2adf1221a440..3affec823b3f 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -79,6 +79,7 @@ struct bch_member { #define BCH_MEMBER_V1_BYTES 56 +LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16) LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) /* 4-14 unused, was TIER, 
HAS_(META)DATA, REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f2e4428281a3..572b06bfa0b8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -489,8 +489,8 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, opt_id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, opt_id, -1); prt_printf(out, "Invalid option "); ret = bch2_opt_validate(opt, v, out); @@ -1473,8 +1473,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, id, -1); prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, From 8b294a9b5c14473d6d7e35756d201922ba785042 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 18:44:25 -0400 Subject: [PATCH 154/180] bcachefs: Device options now use standard sysfs code Device options now use the common code for sysfs, and can use superblock fields (in a struct bch_member). This replaces BCH_DEV_OPT_SETTERS(), which was weird and easy to miss. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 4 +- fs/bcachefs/opts.h | 2 +- fs/bcachefs/super.c | 6 +-- fs/bcachefs/sysfs.c | 113 +++++++++++++++++++++++--------------------- fs/bcachefs/sysfs.h | 5 +- fs/bcachefs/xattr.c | 2 +- 6 files changed, 69 insertions(+), 63 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index ae47345f93c1..de24af773224 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -480,7 +480,7 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) { int ret = 0; @@ -501,7 +501,7 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) int bch2_opts_check_may_set(struct bch_fs *c) { for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_check_may_set(c, i, bch2_opt_get_by_id(&c->opts, i)); + int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index c0adfd5b4f1c..19fcc0e51c0b 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -635,7 +635,7 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8e928b3d8ef4..d662adfbdbcc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -715,7 +715,7 @@ static int bch2_fs_online(struct bch_fs *c) kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: #endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: - bch2_opts_create_sysfs_files(&c->opts_dir); + bch2_opts_create_sysfs_files(&c->opts_dir, 
OPT_FS); if (ret) { bch_err(c, "error creating sysfs objects"); return ret; @@ -1297,8 +1297,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) return 0; if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &c->kobj, - "dev-%u", ca->dev_idx); + ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: + bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); if (ret) return ret; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 2ed3f755eadd..4c5b585041be 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -152,10 +152,8 @@ read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); read_attribute(flags); -read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); -rw_attribute(durability); read_attribute(io_done); read_attribute(io_errors); write_attribute(io_errors_reset); @@ -208,8 +206,6 @@ read_attribute(usage_base); BCH_PERSISTENT_COUNTERS() #undef x -rw_attribute(discard); -read_attribute(state); rw_attribute(label); read_attribute(copy_gc_wait); @@ -599,26 +595,34 @@ struct attribute *bch2_fs_internal_files[] = { /* options */ -SHOW(bch2_fs_opts_dir) +static ssize_t sysfs_opt_show(struct bch_fs *c, + struct bch_dev *ca, + enum bch_opt_id id, + struct printbuf *out) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int id = opt - bch2_opt_table; - u64 v = bch2_opt_get_by_id(&c->opts, id); + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + if (opt->flags & OPT_FS) { + v = bch2_opt_get_by_id(&c->opts, id); + } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { + v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); + } else { + return -EINVAL; + } bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); prt_char(out, '\n'); - return 0; } -STORE(bch2_fs_opts_dir) +static ssize_t sysfs_opt_store(struct bch_fs *c, + struct bch_dev 
*ca, + enum bch_opt_id id, + const char *buf, size_t size) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int ret, id = opt - bch2_opt_table; - char *tmp; - u64 v; + const struct bch_option *opt = bch2_opt_table + id; + int ret = 0; /* * We don't need to take c->writes for correctness, but it eliminates an @@ -627,23 +631,21 @@ STORE(bch2_fs_opts_dir) if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) return -EROFS; - tmp = kstrdup(buf, GFP_KERNEL); + char *tmp = kstrdup(buf, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto err; } - ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + u64 v; + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: + bch2_opt_check_may_set(c, ca, id, v); kfree(tmp); if (ret < 0) goto err; - ret = bch2_opt_check_may_set(c, id, v); - if (ret < 0) - goto err; - - bch2_opt_set_sb(c, NULL, opt, v); + bch2_opt_set_sb(c, ca, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if (v && @@ -664,22 +666,41 @@ STORE(bch2_fs_opts_dir) bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return ret; } + +SHOW(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_show(c, NULL, id, out); +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_store(c, NULL, id, buf, size); +} SYSFS_OPS(bch2_fs_opts_dir); struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -int bch2_opts_create_sysfs_files(struct kobject *kobj) +int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) { - const struct bch_option *i; - int ret; - - for (i = bch2_opt_table; + for (const struct bch_option *i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (!(i->flags & OPT_FS)) + if (i->flags & OPT_HIDDEN) + 
continue; + if (!(i->flags & type)) continue; - ret = sysfs_create_file(kobj, &i->attr); + int ret = sysfs_create_file(kobj, &i->attr); if (ret) return ret; } @@ -750,11 +771,8 @@ SHOW(bch2_dev) sysfs_printf(uuid, "%pU\n", ca->uuid.b); - sysfs_print(bucket_size, bucket_bytes(ca)); sysfs_print(first_bucket, ca->mi.first_bucket); sysfs_print(nbuckets, ca->mi.nbuckets); - sysfs_print(durability, ca->mi.durability); - sysfs_print(discard, ca->mi.discard); if (attr == &sysfs_label) { if (ca->mi.group) @@ -767,11 +785,6 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state) { - prt_string_option(out, bch2_member_states, ca->mi.state); - prt_char(out, '\n'); - } - if (attr == &sysfs_io_done) dev_io_done_to_text(out, ca); @@ -797,6 +810,10 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_show(c, ca, opt_id, out); + return 0; } @@ -805,18 +822,6 @@ STORE(bch2_dev) struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); - } - - if (attr == &sysfs_durability) { - u64 v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); - } - if (attr == &sysfs_label) { char *tmp; int ret; @@ -834,20 +839,20 @@ STORE(bch2_dev) if (attr == &sysfs_io_errors_reset) bch2_dev_errors_reset(ca); + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_store(c, ca, opt_id, buf, size); + return size; } SYSFS_OPS(bch2_dev); struct attribute *bch2_dev_files[] = { &sysfs_uuid, - &sysfs_bucket_size, &sysfs_first_bucket, &sysfs_nbuckets, - &sysfs_durability, /* settings: */ - &sysfs_discard, - &sysfs_state, &sysfs_label, &sysfs_has_data, diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h index 222cd5062702..303e0433c702 100644 --- 
a/fs/bcachefs/sysfs.h +++ b/fs/bcachefs/sysfs.h @@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; extern const struct sysfs_ops bch2_dev_sysfs_ops; -int bch2_opts_create_sysfs_files(struct kobject *); +int bch2_opts_create_sysfs_files(struct kobject *, unsigned); #else @@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; static const struct sysfs_ops bch2_dev_sysfs_ops; -static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } +static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) +{ return 0; } #endif /* NO_BCACHEFS_SYSFS */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index aed7c6984173..f9667b944c0d 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -523,7 +523,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err_class_exit; - ret = bch2_opt_check_may_set(c, opt_id, v); + ret = bch2_opt_check_may_set(c, NULL, opt_id, v); if (ret < 0) goto err_class_exit; From 7b84d934a16274eaedfb38cb94b909426048c48e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 00:55:23 -0400 Subject: [PATCH 155/180] bcachefs: Setting foreground_target at runtime now triggers rebalance Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4c5b585041be..b01b2ae87051 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -650,6 +650,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, if (v && (id == Opt_background_target || + (id == Opt_foreground_target && !c->opts.background_target) || id == Opt_background_compression || (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); From 8d7b7ac367cd0e7f0e496ba6526799994b3c1237 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 00:55:52 -0400 Subject: [PATCH 156/180] bcachefs: Device state is now a runtime option Other options can normally be set at runtime via sysfs, no reason for this one not to be as well - it just doesn't support the degraded flags argument this way, that requires the ioctl. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 7 +++++++ fs/bcachefs/opts.h | 2 +- fs/bcachefs/sysfs.c | 3 +++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index de24af773224..d0ce96529dd4 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -482,9 +482,16 @@ void bch2_opts_to_text(struct printbuf *out, int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) { + lockdep_assert_held(&c->state_lock); + int ret = 0; switch (id) { + case Opt_state: + if (ca) + return __bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); + break; + case Opt_compression: case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 19fcc0e51c0b..0bf39e4b1c8e 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -495,7 +495,7 @@ enum fsck_err_opts { NULL, "Skip submit_bio() for data reads and writes, " \ "for performance testing purposes") \ x(state, u64, \ - OPT_DEVICE, \ + OPT_DEVICE|OPT_RUNTIME, \ OPT_STR(bch2_member_states), \ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ "state", "rw,ro,failed,spare") \ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b01b2ae87051..e8a795578186 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -631,6 +631,8 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) return -EROFS; + down_write(&c->state_lock); + char *tmp = kstrdup(buf, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; @@ -664,6 +666,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, ret = size; err: + 
up_write(&c->state_lock); bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return ret; } From 80be08cdb5a82208a0bb67ad93b3fb7447fd2873 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Mar 2025 00:54:10 -0400 Subject: [PATCH 157/180] bcachefs: Filesystem discard option now propagates to devices The discard option is special, because it's both a filesystem and a device option. When set at the filesystem level, it's supposed to propagate to (if set persistently via sysfs) or override (if non-persistently as a mount option) the devices - that now works correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 ++++++++++++++- fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/fs.c | 3 +++ fs/bcachefs/sysfs.c | 9 +++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4dfcf3e6fffd..54e0cc373bb1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1806,6 +1806,19 @@ struct discard_buckets_state { u64 discarded; }; +/* + * This is needed because discard is both a filesystem option and a device + * option, and mount options are supposed to apply to that mount and not be + * persisted, i.e. if it's set as a mount option we can't propagate it to the + * device. + */ +static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +{ + return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) + ? 
c->opts.discard + : ca->mi.discard; +} + static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1869,7 +1882,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; *discard_pos_done = iter.pos; - if (ca->mi.discard && !c->opts.nochanges) { + if (discard_opt_enabled(c, ca) && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0ea593e813f4..f52311017aee 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -627,7 +627,8 @@ struct bch_dev { x(topology_error) \ x(errors_fixed) \ x(errors_not_fixed) \ - x(no_invalid_checks) + x(no_invalid_checks) \ + x(discard_mount_opt_set) \ enum bch_fs_flags { #define x(n) BCH_FS_##n, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 17ac9c55fb96..4453dd2f888e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2172,6 +2172,9 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (ret) goto err; + if (opt_defined(opts, discard)) + set_bit(BCH_FS_discard_mount_opt_set, &c->flags); + /* Some options can't be parsed until after the fs is started: */ opts = bch2_opts_empty(); ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index e8a795578186..251ba8224c1f 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -664,6 +664,15 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, c->copygc_thread) wake_up_process(c->copygc_thread); + if (id == Opt_discard && !ca) { + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) + opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + ret = size; err: up_write(&c->state_lock); From 4a4000b9a6fd0d1de0349cd398433fd7fdf64292 Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Tue, 18 Mar 2025 15:52:08 -0400 Subject: [PATCH 158/180] bcachefs: Kill JOURNAL_ERRORS() Convert these to standard error codes, which means we can pass them outside the journal code, they're easier to pass to tracepoints, etc. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 36 +++++++++-------- fs/bcachefs/errcode.h | 16 ++++++-- fs/bcachefs/journal.c | 69 ++++++++++++++------------------ fs/bcachefs/journal_reclaim.c | 4 +- fs/bcachefs/journal_types.h | 21 +--------- 5 files changed, 65 insertions(+), 81 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 892d20a50a52..d50dc31d0bea 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -880,6 +880,24 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && + watermark < BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto out; + } + + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK)); + goto out; + } + switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: ret = bch2_btree_split_leaf(trans, i->path, flags); @@ -891,22 +909,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = drop_locks_do(trans, bch2_accounting_update_sb(trans)); break; - case -BCH_ERR_journal_res_get_blocked: - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; - } - - ret = drop_locks_do(trans, - 
bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK)); - break; case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); @@ -927,7 +929,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, BUG_ON(ret >= 0); break; } - +out: BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index cb27de6ffad6..c179954aaf33 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -218,10 +218,18 @@ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ - x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ - x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ - x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ - x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ + x(BCH_ERR_journal_res_blocked, journal_max_open) \ + x(BCH_ERR_journal_res_blocked, journal_full) \ + x(BCH_ERR_journal_res_blocked, journal_pin_full) \ + x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ + x(BCH_ERR_journal_res_blocked, journal_stuck) \ + x(BCH_ERR_journal_res_blocked, journal_retry_open) \ + x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index fc15644c9f24..ce7302695547 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -20,13 +20,6 @@ #include "journal_seq_blacklist.h" #include "trace.h" -static const char * 
const bch2_journal_errors[] = { -#define x(n) #n, - JOURNAL_ERRORS() -#undef x - NULL -}; - static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -149,8 +142,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) bool stuck = false; struct printbuf buf = PRINTBUF; - if (!(error == JOURNAL_ERR_journal_full || - error == JOURNAL_ERR_journal_pin_full) || + if (!(error == -BCH_ERR_journal_full || + error == -BCH_ERR_journal_pin_full) || nr_unwritten_journal_entries(j) || (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) return stuck; @@ -177,7 +170,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) spin_unlock(&j->lock); bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", - bch2_journal_errors[error]); + bch2_err_str(error)); bch2_journal_debug_to_text(&buf, j); bch_err(c, "%s", buf.buf); @@ -388,32 +381,33 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return JOURNAL_ERR_blocked; + return -BCH_ERR_journal_blocked; if (j->cur_entry_error) return j->cur_entry_error; - if (bch2_journal_error(j)) - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + int ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (!fifo_free(&j->pin)) - return JOURNAL_ERR_journal_pin_full; + return -BCH_ERR_journal_pin_full; if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return JOURNAL_ERR_max_in_flight; + return -BCH_ERR_journal_max_in_flight; if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return JOURNAL_ERR_max_open; + return -BCH_ERR_journal_max_open; if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + return -BCH_ERR_journal_shutdown; } if (!j->free_buf && 
!buf->data) - return JOURNAL_ERR_enomem; /* will retry after write completion frees up a buf */ + return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ BUG_ON(!j->cur_entry_sectors); @@ -437,7 +431,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return JOURNAL_ERR_journal_full; + return -BCH_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -574,20 +568,21 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (j->blocked) - return -BCH_ERR_journal_res_get_blocked; + return -BCH_ERR_journal_blocked; if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = JOURNAL_ERR_max_in_flight; + ret = -BCH_ERR_journal_max_in_flight; goto out; } @@ -617,20 +612,20 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; + ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); out: if (likely(!ret)) return 0; - if (ret == JOURNAL_ERR_retry) + if (ret == -BCH_ERR_journal_retry_open) goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_res_get_blocked; + ret = -BCH_ERR_journal_stuck; - if (ret == JOURNAL_ERR_max_in_flight && + if (ret == -BCH_ERR_journal_max_in_flight && 
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && trace_journal_entry_full_enabled()) { struct printbuf buf = PRINTBUF; @@ -647,7 +642,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, count_event(c, journal_entry_full); } - if (ret == JOURNAL_ERR_max_open && + if (ret == -BCH_ERR_journal_max_open && track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && trace_journal_entry_full_enabled()) { struct printbuf buf = PRINTBUF; @@ -668,8 +663,8 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == JOURNAL_ERR_journal_full || - ret == JOURNAL_ERR_journal_pin_full) && + if ((ret == -BCH_ERR_journal_full || + ret == -BCH_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -682,9 +677,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, } } - return ret == JOURNAL_ERR_insufficient_devices - ? 
-BCH_ERR_erofs_journal_err - : -BCH_ERR_journal_res_get_blocked; + return ret; } static unsigned max_dev_latency(struct bch_fs *c) @@ -714,7 +707,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), HZ)) return ret; @@ -728,7 +721,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, remaining_wait = max(0, remaining_wait - HZ); if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), remaining_wait)) return ret; @@ -740,7 +733,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, printbuf_exit(&buf); closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } @@ -1647,7 +1640,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index d373cd181a7f..3ed31492e1aa 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -226,7 +226,7 @@ void bch2_journal_space_available(struct journal *j) bch_err(c, "%s", buf.buf); printbuf_exit(&buf); - ret = JOURNAL_ERR_insufficient_devices; + ret = -BCH_ERR_insufficient_journal_devices; goto out; } @@ -240,7 +240,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index fd82f5d80355..8e0eba776b9d 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -151,25 +151,6 @@ enum journal_flags { #undef x }; -/* Reasons we may fail to get a journal reservation: */ -#define JOURNAL_ERRORS() \ - x(ok) \ - x(retry) \ - x(blocked) \ - x(max_in_flight) \ - x(max_open) \ - x(journal_full) \ - x(journal_pin_full) \ - x(journal_stuck) \ - x(enomem) \ - x(insufficient_devices) - -enum journal_errors { -#define x(n) JOURNAL_ERR_##n, - JOURNAL_ERRORS() -#undef x -}; - typedef DARRAY(u64) darray_u64; struct journal_bio { @@ -204,7 +185,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - enum journal_errors cur_entry_error; + int cur_entry_error; unsigned 
cur_entry_offset_if_blocked; unsigned buf_size_want; From 5d361ae5afeef075264b369c413dbed0a0d04cfa Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 18 Mar 2025 15:50:00 +0800 Subject: [PATCH 159/180] bcachefs: Add missing smp_rmb() The smp_rmb() guarantees that reads from reservations.counter occur before accessing cur_entry_u64s. It's paired with the atomic64_try_cmpxchg in journal_entry_open. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index ab68c5c4d8d8..47828771f9c2 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -350,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j, /* * Check if there is still room in the current journal - * entry: + * entry, smp_rmb() guarantees that reads from reservations.counter + * occur before accessing cur_entry_u64s: */ + smp_rmb(); if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; From af2ff37da7ad6aabb79f57c9f7331600bd2b982d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Mar 2025 12:33:40 -0400 Subject: [PATCH 160/180] bcachefs: Fix block/btree node size defaults We're fixing option parsing in userspace, it now obeys OPT_SB_FIELD_SECTORS Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 6 ++---- fs/bcachefs/opts.h | 9 ++------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index d0ce96529dd4..81fd6b7977d3 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -684,12 +684,10 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; - if ((opt->flags & OPT_FS) && opt->set_sb) + if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) opt->set_sb(sb, v); - if ((opt->flags & OPT_DEVICE) && - opt->set_member && - dev_idx >= 0) { + if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { if 
(WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 0bf39e4b1c8e..bb621804d45a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -128,13 +128,13 @@ enum fsck_err_opts { OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 16), \ - BCH_SB_BLOCK_SIZE, 8, \ + BCH_SB_BLOCK_SIZE, 4 << 10, \ "size", NULL) \ x(btree_node_size, u32, \ OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 20), \ - BCH_SB_BTREE_NODE_SIZE, 512, \ + BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -499,11 +499,6 @@ enum fsck_err_opts { OPT_STR(bch2_member_states), \ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ "state", "rw,ro,failed,spare") \ - x(fs_size, u64, \ - OPT_DEVICE|OPT_HIDDEN, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_MEMBER_OPT, 0, \ - "size", "Size of filesystem on device") \ x(bucket_size, u32, \ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, S64_MAX), \ From 2fe208303a114012d7ca035dcb37edcee93b64f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 10:25:15 -0400 Subject: [PATCH 161/180] bcachefs: Simplify bch2_write_op_error() There's no reason for the caller to do the actual logging, it's all done the same. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 15 ++---- fs/bcachefs/io_write.c | 110 ++++++++++++----------------------------- fs/bcachefs/io_write.h | 9 +--- 3 files changed, 37 insertions(+), 97 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 31467f77930f..91483f83eb59 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -270,24 +270,15 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "extent too big to decompress"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + bch2_write_op_error(op, op->pos.offset, "extent too big to decompress"); return -EIO; } data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "decompression error"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } + if (!c->opts.no_data_io) + bch2_write_op_error(op, op->pos.offset, "decompression error"); ret = -EIO; goto err; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index a2e6b30530e3..09df19654458 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -402,61 +402,36 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset, const char *fmt, ...) +void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) 
{ - if (op->subvol) - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9)); - else { - struct bpos pos = op->pos; - pos.offset = offset; - lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); - } + struct printbuf buf = PRINTBUF; - prt_str(out, "write error: "); - - va_list args; - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); - - if (op->flags & BCH_WRITE_move) { - struct data_update *u = container_of(op, struct data_update, op); - - prt_printf(out, "\n from internal move "); - bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); - } -} - -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset, - const char *fmt, ...) -{ - if (op->subvol) - bch2_inum_offset_err_msg(op->c, out, + if (op->subvol) { + bch2_inum_offset_err_msg(op->c, &buf, (subvol_inum) { op->subvol, op->pos.inode, }, offset << 9); - else { + } else { struct bpos pos = op->pos; pos.offset = offset; - bch2_inum_snap_offset_err_msg(op->c, out, pos); + bch2_inum_snap_offset_err_msg(op->c, &buf, pos); } - prt_str(out, "write error: "); + prt_str(&buf, "write error: "); va_list args; va_start(args, fmt); - prt_vprintf(out, fmt, args); + prt_vprintf(&buf, fmt, args); va_end(args); if (op->flags & BCH_WRITE_move) { struct data_update *u = container_of(op, struct data_update, op); - prt_printf(out, "\n from internal move "); - bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + prt_printf(&buf, "\n from internal move "); + bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); } + + bch_err_ratelimited(op->c, "%s", buf.buf); + printbuf_exit(&buf); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -598,11 +573,8 @@ static void __bch2_write_index(struct bch_write_op *op) if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - struct printbuf buf = 
PRINTBUF; - bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k), + bch2_write_op_error(op, bkey_start_offset(&insert->k), "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); } if (ret) @@ -1169,13 +1141,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, *_dst = dst; return more; csum_err: - { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "error verifying existing checksum while rewriting existing data (memory corruption?)"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } + bch2_write_op_error(op, op->pos.offset, + "error verifying existing checksum while rewriting existing data (memory corruption?)"); ret = -EIO; err: @@ -1255,32 +1222,29 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - - struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k), - "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (ret) { - op->error = ret; + if (ret) break; - } } bch2_trans_put(trans); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); + } + + if (ret) 
+ op->error = ret; } static void __bch2_nocow_write_done(struct bch_write_op *op) @@ -1436,11 +1400,8 @@ static void bch2_nocow_write(struct bch_write_op *op) darray_exit(&buckets); if (ret) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, + bch2_write_op_error(op, op->pos.offset, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); op->error = ret; op->flags |= BCH_WRITE_submitted; } @@ -1558,13 +1519,9 @@ static void __bch2_write(struct bch_write_op *op) op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_alloc_nowait)) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, + if (!(op->flags & BCH_WRITE_alloc_nowait)) + bch2_write_op_error(op, op->pos.offset, "%s(): %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } op->error = ret; break; } @@ -1691,10 +1648,7 @@ CLOSURE_CALLBACK(bch2_write) wbio_init(bio)->put_bio = false; if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op, op->pos.offset, - "misaligned write"); - printbuf_exit(&buf); + bch2_write_op_error(op, op->pos.offset, "misaligned write"); op->error = -EIO; goto err; } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 627730537752..b8ab19a1e1da 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -14,13 +14,8 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); -__printf(5, 6) -void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64, const char *, ...); - -__printf(4, 5) -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64, - const char *, ...); 
+__printf(3, 4) +void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); #define BCH_WRITE_FLAGS() \ x(alloc_nowait) \ From 127d90d2823ef45dd37246aaf7bb0392e6231c38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 10:30:51 -0400 Subject: [PATCH 162/180] bcachefs: bch2_write_prep_encoded_data() now returns errcode Prep work for killing off EIO and replacing them with proper private error codes. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 159 ++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 88 deletions(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 09df19654458..a861f786c3db 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -434,6 +434,12 @@ void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, . printbuf_exit(&buf); } +static void bch2_write_csum_err_msg(struct bch_write_op *op) +{ + bch2_write_op_error(op, op->pos.offset, + "error verifying existing checksum while rewriting existing data (memory corruption?)"); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -809,7 +815,6 @@ static int bch2_write_rechecksum(struct bch_fs *c, { struct bio *bio = &op->wbio.bio; struct bch_extent_crc_unpacked new_crc; - int ret; /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ @@ -817,10 +822,10 @@ static int bch2_write_rechecksum(struct bch_fs *c, bch2_csum_type_is_encryption(new_csum_type)) new_csum_type = op->crc.csum_type; - ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); + int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); if (ret) return ret; @@ -830,44 +835,12 @@ static int bch2_write_rechecksum(struct bch_fs *c, return 0; } -static int bch2_write_decrypt(struct bch_write_op 
*op) -{ - struct bch_fs *c = op->c; - struct nonce nonce = extent_nonce(op->version, op->crc); - struct bch_csum csum; - int ret; - - if (!bch2_csum_type_is_encryption(op->crc.csum_type)) - return 0; - - /* - * If we need to decrypt data in the write path, we'll no longer be able - * to verify the existing checksum (poly1305 mac, in this case) after - * it's decrypted - this is the last point we'll be able to reverify the - * checksum: - */ - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return -EIO; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - return ret; -} - -static enum prep_encoded_ret { - PREP_ENCODED_OK, - PREP_ENCODED_ERR, - PREP_ENCODED_CHECKSUM_ERR, - PREP_ENCODED_DO_WRITE, -} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - - if (!(op->flags & BCH_WRITE_data_encoded)) - return PREP_ENCODED_OK; + struct nonce nonce = extent_nonce(op->version, op->crc); + int ret = 0; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -878,12 +851,13 @@ static enum prep_encoded_ret { (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + op->csum_type != op->crc.csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } - return PREP_ENCODED_DO_WRITE; + return 1; } /* @@ -891,20 +865,23 @@ static enum prep_encoded_ret { * is, we have to decompress it: */ if (crc_is_compressed(op->crc)) { - struct bch_csum csum; - - if 
(bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - /* Last point we can still verify checksum: */ - csum = bch2_checksum_bio(c, op->crc.csum_type, - extent_nonce(op->version, op->crc), - bio); + struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + goto csum_err; - if (bch2_bio_uncompress_inplace(op, bio)) - return PREP_ENCODED_ERR; + if (bch2_csum_type_is_encryption(op->crc.csum_type)) { + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } + + ret = bch2_bio_uncompress_inplace(op, bio); + if (ret) + return ret; } /* @@ -916,22 +893,34 @@ static enum prep_encoded_ret { * If the data is checksummed and we're only writing a subset, * rechecksum and adjust bio to point to currently live data: */ - if ((op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + if (op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } /* * If we want to compress the data, it has to be decrypted: */ - if ((op->compression_opt || - bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(op->csum_type)) && - bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; + if (bch2_csum_type_is_encryption(op->crc.csum_type) && + (op->compression_opt || op->crc.csum_type != op->csum_type)) { + struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + goto csum_err; - return PREP_ENCODED_OK; + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type 
= 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } + + return 0; +csum_err: + bch2_write_csum_err_msg(op); + return -EIO; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -950,25 +939,21 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ec_buf = bch2_writepoint_ec_buf(c, wp); - switch (bch2_write_prep_encoded_data(op, wp)) { - case PREP_ENCODED_OK: - break; - case PREP_ENCODED_ERR: - ret = -EIO; - goto err; - case PREP_ENCODED_CHECKSUM_ERR: - goto csum_err; - case PREP_ENCODED_DO_WRITE: - /* XXX look for bug here */ - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; + if (unlikely(op->flags & BCH_WRITE_data_encoded)) { + ret = bch2_write_prep_encoded_data(op, wp); + if (ret < 0) + goto err; + if (ret) { + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; } if (ec_buf || @@ -1141,9 +1126,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, *_dst = dst; return more; csum_err: - bch2_write_op_error(op, op->pos.offset, - "error verifying existing checksum while rewriting existing data (memory corruption?)"); - + bch2_write_csum_err_msg(op); ret = -EIO; err: if (to_wbio(dst)->bounce) From 8a9f3d058279ed0f99114e0449d129fb5abc5eca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 10:16:48 -0400 Subject: [PATCH 163/180] bcachefs: EIO cleanup Replace these with proper private error codes, so that when we get an error message we're not sifting through the entire codebase to see where it came from. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +-- fs/bcachefs/alloc_foreground.c | 4 +-- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/checksum.c | 2 +- fs/bcachefs/compress.c | 56 ++++++++++++++++++---------------- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/ec.c | 16 +++++----- fs/bcachefs/ec.h | 2 +- fs/bcachefs/errcode.h | 18 ++++++++++- fs/bcachefs/inode.c | 4 +-- fs/bcachefs/io_write.c | 14 ++++----- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/journal_reclaim.c | 6 ++-- 13 files changed, 73 insertions(+), 59 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 54e0cc373bb1..2828baa9b162 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -837,7 +837,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -EIO; + return -BCH_ERR_trigger_alloc; struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -1031,7 +1031,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -EIO; + ret = -BCH_ERR_trigger_alloc; goto err; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1759c15a7745..95aafc232290 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) void bch2_open_bucket_write_error(struct bch_fs *c, struct open_buckets *obs, - unsigned dev) + unsigned dev, int err) { struct open_bucket *ob; unsigned i; open_bucket_for_each(c, obs, ob, i) if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob); + bch2_ec_bucket_cancel(c, ob, err); } static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_foreground.h 
b/fs/bcachefs/alloc_foreground.h index baf5dc163c8a..69ec6a012898 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -82,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, } void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned); + struct open_buckets *, unsigned, int); void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 7f9e4c59950c..3726689093e3 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_recompute_checksum; } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 91483f83eb59..85fc90342492 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t src_len = src->bi_iter.bi_size; size_t dst_len = crc.uncompressed_size << 9; void *workspace; - int ret; + int ret = 0, ret2; enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; @@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, else ret = -BCH_ERR_compression_workspace_not_initialized; if (ret) - goto out; + goto err; } src_data = bio_map_or_bounce(c, src, READ); @@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, switch (crc.compression_type) { case BCH_COMPRESSION_TYPE_lz4_old: case BCH_COMPRESSION_TYPE_lz4: - ret = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret != dst_len) - goto err; + ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret2 != 
dst_len) + ret = -BCH_ERR_decompress_lz4; break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); - ret = zlib_inflate(&strm, Z_FINISH); + ret2 = zlib_inflate(&strm, Z_FINISH); mempool_free(workspace, workspace_pool); - if (ret != Z_STREAM_END) - goto err; + if (ret2 != Z_STREAM_END) + ret = -BCH_ERR_decompress_gzip; break; } case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t real_src_len = le32_to_cpup(src_data.b); - if (real_src_len > src_len - 4) + if (real_src_len > src_len - 4) { + ret = -BCH_ERR_decompress_zstd_src_len_bad; goto err; + } workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = zstd_decompress_dctx(ctx, + ret2 = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); mempool_free(workspace, workspace_pool); - if (ret != dst_len) - goto err; + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_zstd; break; } default: BUG(); } - ret = 0; +err: fsck_err: -out: bio_unmap_or_unbounce(c, src_data); return ret; -err: - ret = -EIO; - goto out; } int bch2_bio_uncompress_inplace(struct bch_write_op *op, @@ -268,18 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || - crc->compressed_size << 9 > c->opts.encoded_extent_max) { - bch2_write_op_error(op, op->pos.offset, "extent too big to decompress"); - return -EIO; + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { + bch2_write_op_error(op, op->pos.offset, + "extent too big to decompress (%u > %u)", + crc->uncompressed_size << 9, c->opts.encoded_extent_max); + return -BCH_ERR_decompress_exceeded_max_encoded_extent; } data = __bounce_alloc(c, dst_len, WRITE); - if 
(__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) - bch2_write_op_error(op, op->pos.offset, "decompression error"); - ret = -EIO; + ret = __bio_uncompress(c, bio, data.b, *crc); + + if (c->opts.no_data_io) + ret = 0; + + if (ret) { + bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); goto err; } @@ -312,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -EIO; + return -BCH_ERR_decompress_exceeded_max_encoded_extent; dst_data = dst_len == dst_iter.bi_size ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 08bb7f3019ce..0ec273daccb7 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -354,7 +354,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, printbuf_exit(&buf); bch2_fatal_error(c); - ret = -EIO; + ret = -BCH_ERR_invalid_bkey; goto out; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c73ba73f6890..f2b9225fe0bc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1124,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_erasure_coding_found_btree_node; } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1190,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -EIO; + return -BCH_ERR_ENOENT_dev_not_found; struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1227,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned i, nr_data = v->nr_blocks - 
v->nr_redundant; - int ret = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; - ret = bch2_btree_write_buffer_flush_sync(trans); + int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; - for (i = 0; i < nr_data; i++) { + for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: bch2_trans_put(trans); - return ret; } @@ -1451,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int ec_stripe_new_set_pending(c, h); } -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) { struct ec_stripe_new *s = ob->ec; - s->err = -EIO; + s->err = err; } void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 8f2228e59eda..62d27e04d763 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -249,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c179954aaf33..101806d7ebe1 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,7 @@ x(ENOENT, ENOENT_snapshot_tree) \ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_bucket_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ @@ -207,6 +208,7 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ x(EOPNOTSUPP, 
may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ @@ -267,6 +269,7 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, journal_shutdown) \ x(EIO, journal_flush_err) \ + x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ @@ -275,6 +278,7 @@ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ + x(EIO, trigger_alloc) \ x(EIO, trigger_pointer) \ x(EIO, trigger_stripe_pointer) \ x(EIO, metadata_bucket_inconsistency) \ @@ -290,7 +294,19 @@ x(EIO, EIO_fault_injected) \ x(EIO, ec_block_read) \ x(EIO, ec_block_write) \ - x(EIO, data_read) \ + x(EIO, recompute_checksum) \ + x(EIO, decompress) \ + x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ + x(BCH_ERR_decompress, decompress_lz4) \ + x(BCH_ERR_decompress, decompress_gzip) \ + x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ + x(BCH_ERR_decompress, decompress_zstd) \ + x(EIO, data_write) \ + x(BCH_ERR_data_write, data_write_io) \ + x(BCH_ERR_data_write, data_write_csum) \ + x(BCH_ERR_data_write, data_write_invalid_ptr) \ + x(BCH_ERR_data_write, data_write_misaligned) \ + x(BCH_ERR_decompress, data_read) \ x(BCH_ERR_data_read, no_device_to_read_from) \ x(BCH_ERR_data_read, data_read_io_err) \ x(BCH_ERR_data_read, data_read_csum_err) \ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7aca010e2e10..1383fdcc42a5 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1079,7 +1079,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -1243,7 +1243,7 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -EIO; + ret = 
-BCH_ERR_ENOENT_inode; goto err; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index a861f786c3db..29671075e3f1 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -535,7 +535,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; + return -BCH_ERR_data_write_io; } if (dst != src) @@ -589,7 +589,7 @@ static void __bch2_write_index(struct bch_write_op *op) out: /* If some a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; @@ -920,7 +920,7 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct return 0; csum_err: bch2_write_csum_err_msg(op); - return -EIO; + return -BCH_ERR_data_write_csum; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -1127,7 +1127,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, return more; csum_err: bch2_write_csum_err_msg(op); - ret = -EIO; + ret = -BCH_ERR_data_write_csum; err: if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); @@ -1233,7 +1233,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -EIO; + op->error = -BCH_ERR_data_write_io; } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1424,7 +1424,7 @@ static void bch2_nocow_write(struct bch_write_op *op) "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = 
-BCH_ERR_data_write_invalid_ptr; } else { /* We can retry this: */ ret = -BCH_ERR_transaction_restart; @@ -1632,7 +1632,7 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = -EIO; + op->error = -BCH_ERR_data_write_misaligned; goto err; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index cf2700b06d58..4ed6137f0439 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1624,7 +1624,7 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); - err = -EIO; + err = -BCH_ERR_journal_write_err; } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3ed31492e1aa..5d1547aa118a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j) * @j: journal object * @direct: direct or background reclaim? * @kicked: requested to run since we last ran? - * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (kthread && kthread_should_stop()) break; - if (bch2_journal_error(j)) { - ret = -EIO; + ret = bch2_journal_error(j); + if (ret) break; - } bch2_journal_do_discards(j); From 4fcd4de0a659a1b9b151d9f88e6ec67a6c05fba5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 10:53:52 -0400 Subject: [PATCH 164/180] bcachefs: fs-common.c -> namei.c name <-> inode, code for managing the relationships between inodes and dirents. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 2 +- fs/bcachefs/error.c | 2 +- fs/bcachefs/fs-ioctl.c | 2 +- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/{fs-common.c => namei.c} | 2 +- fs/bcachefs/{fs-common.h => namei.h} | 6 +++--- fs/bcachefs/recovery.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) rename fs/bcachefs/{fs-common.c => namei.c} (99%) rename fs/bcachefs/{fs-common.h => namei.h} (93%) diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1cf17a16af9f..9af65079374f 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -41,7 +41,6 @@ bcachefs-y := \ extent_update.o \ eytzinger.o \ fs.o \ - fs-common.o \ fs-ioctl.o \ fs-io.o \ fs-io-buffered.o \ @@ -64,6 +63,7 @@ bcachefs-y := \ migrate.o \ move.o \ movinggc.o \ + namei.o \ nocow_locking.o \ opts.o \ printbuf.o \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 6d68c89a49b2..207f35d3cce2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,8 +3,8 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" -#include "fs-common.h" #include "journal.h" +#include "namei.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 5b47b94fe1ea..e3a3230fc652 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,8 +5,8 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" -#include "fs-common.h" #include "fs-ioctl.h" +#include "namei.h" #include "quota.h" #include diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4453dd2f888e..273078ceb4df 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -11,7 +11,6 @@ #include "errcode.h" #include "extents.h" #include "fs.h" -#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" @@ -22,6 +21,7 @@ #include "io_read.h" #include "journal.h" #include "keylist.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" 
#include "snapshot.h" diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0e85131d0af8..4271ce4a4c8c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -10,10 +10,10 @@ #include "dirent.h" #include "error.h" #include "fs.h" -#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "namei.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c similarity index 99% rename from fs/bcachefs/fs-common.c rename to fs/bcachefs/namei.c index fbc3da59536c..bc83acbf5414 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/namei.c @@ -4,8 +4,8 @@ #include "acl.h" #include "btree_update.h" #include "dirent.h" -#include "fs-common.h" #include "inode.h" +#include "namei.h" #include "subvolume.h" #include "xattr.h" diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h similarity index 93% rename from fs/bcachefs/fs-common.h rename to fs/bcachefs/namei.h index 2b59210bb5e8..7383b76270e9 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/namei.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_COMMON_H -#define _BCACHEFS_FS_COMMON_H +#ifndef _BCACHEFS_NAMEI_H +#define _BCACHEFS_NAMEI_H #include "dirent.h" @@ -44,4 +44,4 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -#endif /* _BCACHEFS_FS_COMMON_H */ +#endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a6e26733854d..266c5770c824 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -13,12 +13,12 @@ #include "disk_accounting.h" #include "errcode.h" #include "error.h" -#include "fs-common.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" From 
758ea4ff812b4dfd4cef6dba0eb4b0025a7b147e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:06:50 -0400 Subject: [PATCH 165/180] bcachefs: Move bch2_check_dirent_target() to namei.c We're gradually running more and more fsck.c checks at runtime, wherever applicable; when we do so they get moved out of fsck.c. Next patch will call bch2_check_dirent_target() from bch2_lookup(). Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 51 ++++++++++ fs/bcachefs/dirent.h | 2 + fs/bcachefs/fsck.c | 229 +------------------------------------------ fs/bcachefs/namei.c | 173 ++++++++++++++++++++++++++++++++ fs/bcachefs/namei.h | 5 + 5 files changed, 236 insertions(+), 224 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f4c283d1e86a..d7f9f79318a2 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -729,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) return ret < 0 ? ret : 0; } + +/* fsck */ + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; + } + ret = -BCH_ERR_ENOENT_inode; +found: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, 
BTREE_ID_dirents, pos, BTREE_ITER_intent); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &iter); +err: + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index a6e15a012936..0880772b80a9 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -82,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); + #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4271ce4a4c8c..f3853b741937 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -23,13 +23,6 @@ #include #include /* struct qstr */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { @@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = -BCH_ERR_ENOENT_inode; -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 
snapshot, struct bch_inode_unpacked *inode) { @@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} - /* * Find any subvolume associated with a tree of snapshots * We can't rely on master_subvol - it might have been deleted. @@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + bch2_fsck_remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1985,169 +1929,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa trans_was_restarted(trans, restart_count); } -noinline_for_stack -static int check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!target->bi_dir && - !target->bi_dir_offset) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - 
bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } - - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = check_dirent_inode_dirent(trans, iter, d, target); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } -err: -fsck_err: - printbuf_exit(&buf); - 
bch_err_fn(c, ret); - return ret; -} - /* find a subvolume that's a descendent of @snapshot: */ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) { @@ -2247,7 +2028,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + return bch2_fsck_remove_dirent(trans, d.k->p); ret = 0; goto out; } @@ -2291,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root); if (ret) goto err; out: @@ -2378,13 +2159,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = __remove_dirent(trans, d.k->p); + ret = bch2_fsck_remove_dirent(trans, d.k->p); if (ret) goto err; } darray_for_each(target->inodes, i) { - ret = check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode); if (ret) goto err; } diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index bc83acbf5414..4d0ee85e5016 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -564,6 +564,8 @@ int bch2_rename_trans(struct btree_trans *trans, return ret; } +/* inum_to_path */ + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); @@ -654,3 +656,174 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb prt_str_reversed(path, "(disconnected)"); goto out; } + +/* fsck */ + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static 
int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; + int ret = 0; + + if (inode_points_to_dirent(target, d)) + return 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + fsck_err_on(target->bi_flags & BCH_INODE_unlinked, + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + target->bi_flags &= ~BCH_INODE_unlinked; + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + return __bch2_fsck_write_inode(trans, target); + } + + if (bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto err; + + struct bkey_s_c_dirent bp_dirent = + bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), + 0, dirent); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + bool backpointer_exists = !ret; + ret = 0; + + if (fsck_err_on(!backpointer_exists, + trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + 
d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + goto out; + } + + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) { + ret = bch2_fsck_remove_dirent(trans, d.k->p); + goto out; + } + + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = bch2_check_dirent_inode_dirent(trans, iter, d, target); + if (ret) + goto err; + + if (fsck_err_on(d.v->d_type != inode_d_type(target), + trans, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = 
PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + goto err; + + d = dirent_i_to_s_c(n); + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index 7383b76270e9..48a2c8cb5fa9 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -44,4 +44,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); +int bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *); + #endif /* _BCACHEFS_NAMEI_H */ From 9b0d00a3693bbab49dcec00dd981c8661d6011bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:06:50 -0400 Subject: [PATCH 166/180] bcachefs: Refactor bch2_check_dirent_target() Prep work for calling bch2_check_dirent_target() from bch2_lookup(). - Add an inline wrapper, if the target and backpointer match we can skip the function call. - We don't (yet?) want to remove the dirent we did the lookup from (when we find a directory or subvol with multiple valid dirents pointing to it), we can defer on that until later. For now, add an "are we in fsck?" parameter. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 +- fs/bcachefs/inode.h | 1 + fs/bcachefs/namei.c | 127 +++++++++++++++++++++++--------------------- fs/bcachefs/namei.h | 28 ++++++++-- 4 files changed, 93 insertions(+), 67 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f3853b741937..091057023fc5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2072,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = bch2_check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); if (ret) goto err; out: @@ -2165,7 +2165,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } darray_for_each(target->inodes, i) { - ret = bch2_check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); if (ret) goto err; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 428b9be6af34..f82cfbf460d0 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; return S_ISDIR(inode->bi_mode) || + inode->bi_subvol || (!inode->bi_nlink && inode_has_bp); } diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 4d0ee85e5016..93246ad31541 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -659,17 +659,10 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb /* fsck */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) + struct bkey_s_c_dirent d, + struct 
bch_inode_unpacked *target, + bool in_fsck) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -725,52 +718,65 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, bool backpointer_exists = !ret; ret = 0; - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) + if (!backpointer_exists) { + if (fsck_err(trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + } + } else { + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? "directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = bch2_fsck_remove_dirent(trans, d.k->p); - goto out; - } + if (S_ISDIR(target->bi_mode) || target->bi_subvol) { + /* + * XXX: verify connectivity of the other dirent + * up to the root before removing this one + * + * Additionally, bch2_lookup would need to cope with the + * dirent it found being removed - or should we remove + * the other one, even though the inode points to it? 
+ */ + if (in_fsck) { + if (fsck_err(trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) + ret = bch2_fsck_remove_dirent(trans, d.k->p); + } else { + bch2_fs_inconsistent(c, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf); + } - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; + goto out; + } else { + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(!target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } + } } out: err: @@ -781,16 +787,17 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, return ret; } -int bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) +int __bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) { struct bch_fs *c = trans->c; struct 
printbuf buf = PRINTBUF; int ret = 0; - ret = bch2_check_dirent_inode_dirent(trans, iter, d, target); + ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); if (ret) goto err; @@ -815,11 +822,9 @@ int bch2_check_dirent_target(struct btree_trans *trans, n->v.d_inum = cpu_to_le64(target->bi_inum); } - ret = bch2_trans_update(trans, iter, &n->k_i, 0); + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); if (ret) goto err; - - d = dirent_i_to_s_c(n); } err: fsck_err: diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index 48a2c8cb5fa9..2e6f6364767f 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -44,9 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -int bch2_check_dirent_target(struct btree_trans *, - struct btree_iter *, - struct bkey_s_c_dirent, - struct bch_inode_unpacked *); +int __bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *, bool); + +static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static inline int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + if (likely(inode_points_to_dirent(target, d) && + d.v->d_type == inode_d_type(target))) + return 0; + + return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); +} #endif /* _BCACHEFS_NAMEI_H */ From 04e90891be26a240e41d51d1770de56e810fda5e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:41:07 -0400 Subject: [PATCH 167/180] bcachefs: Run bch2_check_dirent_target() at lookup time More on the "full online self healing" project: We now run most of the dirent <-> inode consistency checks, with repair code, 
at runtime - the exact same check and repair code that fsck runs. This will allow us to repair the "dirent points to inode that does not point back" inconsistencies that have been popping up at runtime. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 273078ceb4df..fbca200f7636 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) @@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (inode) goto out; + /* + * Note: if check/repair needs it, we commit before + * bch2_inode_hash_init_insert(), as after that point we can't take a + * restart - not in the top level loop with a commit_do(), like we + * usually do: + */ + struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + /* + * don't remove it: check_inodes might find another inode that points + * back to this dirent + */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; - - /* regular files may have hardlinks: */ - if 
(bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, &inode_u), - buf.buf))) { - ret = -ENOENT; - goto err; - } out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); From 6a9f681ef623ae3804bc2ca3a2d06d2458142975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 11:53:50 -0400 Subject: [PATCH 168/180] bcachefs: Count BCH_DATA_parity backpointers correctly These are counted as stripe data in the corresponding alloc keys. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index c9dfc3657696..8da1b68821a0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -50,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_str(out, " data_type="); + bch2_prt_data_type(out, bp.v->data_type); prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len, @@ -791,6 +793,7 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_cached: return ALLOC_cached; case BCH_DATA_stripe: + case BCH_DATA_parity: return ALLOC_stripe; default: BUG(); From 962322475bb5cebe0da581f6f18d23b00184aa01 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Mar 2025 17:01:38 -0400 Subject: [PATCH 169/180] bcachefs: Handle backpointers with unknown data types New data types might be added later, so we don't want to disallow unknown data types - that'll be a compatibility hassle later. Instead, ignore them. 
Reported-by: syzbot+3a290f5ff67ca3023834@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8da1b68821a0..20c497f0c2cb 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -784,7 +784,7 @@ enum alloc_sector_counter { ALLOC_SECTORS_NR }; -static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +static int data_type_to_alloc_counter(enum bch_data_type t) { switch (t) { case BCH_DATA_btree: @@ -796,7 +796,7 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_parity: return ALLOC_stripe; default: - BUG(); + return -1; } } @@ -847,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (bp.v->bucket_gen != a->gen) continue; - sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); + if (alloc_counter < 0) + continue; + + sectors[alloc_counter] += bp.v->bucket_len; }; bch2_trans_iter_exit(trans, &iter); if (ret) From 9c3a2c9b471aa42b13c26c916f6a0852899a57e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 12:38:59 -0400 Subject: [PATCH 170/180] bcachefs: Disable asm memcpys when kmsan enabled kmsan doesn't know about inline assembly, obviously; this will close a ton of syzbot bugs. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index d41e133acc4d..7d921fc920a0 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -431,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("rep ; movsq" @@ -508,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, u64 *dst = (u64 *) _dst + u64s - 1; u64 *src = (u64 *) _src + u64s - 1; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("std ;\n" From 53cf2a3daa4ca5f0a40eeb18c2be8724f123a63c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 13:24:50 -0400 Subject: [PATCH 171/180] bcachefs: Fix kmsan warnings in bch2_extent_crc_pack() We store to all fields, so the kmsan warnings were spurious - but initializing via stores to bitfields appear to have been giving the compiler/kmsan trouble, and they're not necessary. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 04946d9911f5..ae1a1d917805 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -592,29 +592,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset +#define common_fields(_src) \ + .type = BIT(type), \ + .csum_type = _src.csum_type, \ + .compression_type = _src.compression_type, \ + ._compressed_size = _src.compressed_size - 1, \ + ._uncompressed_size = _src.uncompressed_size - 1, \ + .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32 = (struct bch_extent_crc32) { + common_fields(src), + .csum = (u32 __force) *((__le32 *) &src.csum.lo), + }; break; case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = (u64 __force) src.csum.lo; - dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + dst->crc64 = (struct bch_extent_crc64) { + common_fields(src), + .nonce = src.nonce, + .csum_lo = (u64 __force) src.csum.lo, + .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), + }; break; case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; + dst->crc128 = (struct bch_extent_crc128) { + common_fields(src), + .nonce = src.nonce, + .csum = src.csum, + }; break; default: BUG(); From 
28aa859b6b422da5c982610d0add9128f813e9f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:17:53 -0400 Subject: [PATCH 172/180] bcachefs: kmsan asserts Catching these early makes them a lot easier to track down. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 1 + fs/bcachefs/btree_update.c | 2 ++ fs/bcachefs/btree_update.h | 2 ++ 3 files changed, 5 insertions(+) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index d50dc31d0bea..7d7e52ddde02 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index b3e346b5f8d7..bd2eb42edb24 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -512,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 47d8690f01bf..d2e1c04353f6 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = 
bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); From 1f88c35674954fbb0b14d994c5fa02c7c5190356 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:54:49 -0400 Subject: [PATCH 173/180] bcachefs: Fix a KMSAN splat in btree_update_nodes_written() We may sometimes read from uninitialized memory; we know, and that's ok. We check if a btree node has been reused before waiting on any outstanding IO. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d3e0cf01ba37..67f1e3202835 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, return 0; } +/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ +static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) +{ + struct btree_node *b_data = READ_ONCE(b->data); + + return (b_data ? b_data->keys.seq : 0) == seq; +} + static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; @@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - __le64 seq; - b = as->old_nodes[i]; - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? 
b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) + if (btree_node_seq_matches(b, as->old_nodes_seq[i])) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } From 9ea24b287b3b9118a157509d931e7d27414e98c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Mar 2025 14:15:33 -0400 Subject: [PATCH 174/180] bcachefs: Eliminate padding in move_bucket_key We appear to be tripping over a compiler/kmsan bug with padding fields - this is an easy workaround. Signed-off-by: Kent Overstreet --- fs/bcachefs/move_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 82e473ed48d2..807f779f6f76 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -32,7 +32,7 @@ struct bch_move_stats { struct move_bucket_key { struct bpos bucket; - u8 gen; + unsigned gen; }; struct move_bucket { From 5ae6f33053af6e904e609593d05e4faf3aeb16fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 11:30:09 -0400 Subject: [PATCH 175/180] bcachefs: zero init journal bios fix a kmsan splat Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ce7302695547..bfdaea6569ae 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1510,7 +1510,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; From f4a584f4bf64e0db30312088d504d4da29ca556b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 
12:29:56 -0400 Subject: [PATCH 176/180] bcachefs: bch2_disk_accounting_mod2() We're hitting some issues with uninitialized struct padding, flagged by kmsan. They appear to be false positives, otherwise bch2_accounting_validate() would have flagged them as "junk at end". But for now, we'll need to initialize disk_accounting_pos with memset(). This adds a new helper, bch2_disk_accounting_mod2(), that initializes a disk_accounting_pos and does the accounting mod all at once - so overall things actually get slightly more ergonomic. BCH_DISK_ACCOUNTING_replicas keys are left for now; KMSAN isn't warning about them and they're a bit special. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++--- fs/bcachefs/buckets.c | 66 ++++++++++------------------ fs/bcachefs/disk_accounting.h | 18 ++++++++ fs/bcachefs/disk_accounting_format.h | 12 ++--- fs/bcachefs/inode.c | 7 ++- fs/bcachefs/super.c | 9 ++-- 6 files changed, 57 insertions(+), 65 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2828baa9b162..5fb396be9127 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -777,14 +777,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s s64 delta_sectors, s64 delta_fragmented, unsigned flags) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = data_type, - }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + d, dev_data_type, + .dev = ca->dev_idx, + .data_type = data_type); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bb7742cf0014..e56ef623ebc1 100644 --- a/fs/bcachefs/buckets.c +++
b/fs/bcachefs/buckets.c @@ -724,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans, .replicas.nr_required = 1, }; - struct disk_accounting_pos acct_compression_key = { - .type = BCH_DISK_ACCOUNTING_compression, - }; + unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -760,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans, acc_replicas_key.replicas.nr_required = 0; } - if (acct_compression_key.compression.type && - acct_compression_key.compression.type != p.crc.compression_type) { + if (cur_compression_type && + cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; @@ -775,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans, compression_acct[2] = 0; } - acct_compression_key.compression.type = p.crc.compression_type; + cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; @@ -789,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - struct disk_accounting_pos acc_snapshot_key = { - .type = BCH_DISK_ACCOUNTING_snapshot, - .snapshot.id = k.k->p.snapshot, - }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } - if (acct_compression_key.compression.type) { + if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, 
ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; } if (level) { - struct disk_accounting_pos acc_btree_key = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = btree_id, - }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct disk_accounting_pos acc_inum_key = { - .type = BCH_DISK_ACCOUNTING_inum, - .inum.inum = k.k->p.inode, - }; + s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), *replicas_sectors, }; - ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } @@ -876,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans, } int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta -= s; + need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; - need_rebalance_sectors_delta += s; + need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, @@ -893,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans, return ret; } - if (need_rebalance_sectors_delta) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, - flags & 
BTREE_TRIGGER_gc); + if (need_rebalance_sectors_delta[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -914,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors = k.k->size; + s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; + sectors[0] = -sectors[0]; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, - }; - - return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, + persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f4372cafea2e..f9214e2d1346 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -85,6 +85,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); + +#define disk_accounting_key_init(_k, _type, ...) \ +do { \ + memset(&(_k), 0, sizeof(_k)); \ + (_k).type = BCH_DISK_ACCOUNTING_##_type; \ + (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ +} while (0) + +#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ +({ \ + struct disk_accounting_pos pos; \ + disk_accounting_key_init(pos, __VA_ARGS__); \ + bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ +}) + +#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...)
\ + bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 7b6e6c97e6aa..15190196485f 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -113,14 +113,14 @@ enum disk_accounting_type { BCH_DISK_ACCOUNTING_TYPE_NR, }; -struct bch_nr_inodes { +struct bch_acct_nr_inodes { }; -struct bch_persistent_reserved { +struct bch_acct_persistent_reserved { __u8 nr_replicas; }; -struct bch_dev_data_type { +struct bch_acct_dev_data_type { __u8 dev; __u8 data_type; }; @@ -149,10 +149,10 @@ struct disk_accounting_pos { struct { __u8 type; union { - struct bch_nr_inodes nr_inodes; - struct bch_persistent_reserved persistent_reserved; + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; - struct bch_dev_data_type dev_data_type; + struct bch_acct_dev_data_type dev_data_type; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1383fdcc42a5..80051073f613 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans, bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { - struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { + int ret = 
bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); if (ret) return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d662adfbdbcc..99f9a0aaa380 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1990,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; From 0b4fd567261bc21ba1fd8636489396f0940b54f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 14:22:39 -0400 Subject: [PATCH 177/180] bcachefs: btree_trans_restart_foreign_task() In debug mode, we save the call stack on transaction restart - but there's no locking, so we can't touch it if we're issuing the restart from another thread. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 9 ++++++++- fs/bcachefs/btree_locking.c | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b96157f3dc9c..8823eec6b284 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra } __always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = ip; + return -err; +} + +__always_inline +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +{ + btree_trans_restart_foreign_task(trans, err, ip); #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index caef65adeae4..b18fbf6f6226 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); - return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart_foreign_task(i->trans, + BCH_ERR_transaction_restart_would_deadlock, + _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); From 739200c57384313e688e6945b56782721c29459f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 21:53:41 -0400 Subject: [PATCH 178/180] bcachefs: Fix race in print_chain() 00636 Unable to handle kernel NULL pointer dereference at virtual address 
00000000000000b0 00636 Mem abort info: 00636 ESR = 0x0000000096000005 00636 EC = 0x25: DABT (current EL), IL = 32 bits 00636 SET = 0, FnV = 0 00636 EA = 0, S1PTW = 0 00636 FSC = 0x05: level 1 translation fault 00636 Data abort info: 00636 ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 00636 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 00636 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 00636 user pgtable: 4k pages, 39-bit VAs, pgdp=0000000101b10000 00636 [00000000000000b0] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 00636 Internal error: Oops: 0000000096000005 [#1] SMP 00636 Modules linked in: 00636 CPU: 12 UID: 0 PID: 79369 Comm: cat Not tainted 6.14.0-rc6-ktest-g3783b8973ab7 #17757 00636 Hardware name: linux,dummy-virt (DT) 00636 pstate: 20001005 (nzCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00636 pc : print_chain+0xb8/0x170 00636 lr : print_chain+0xa0/0x170 00636 sp : ffffff80d9c1bbb0 00636 x29: ffffff80d9c1bbb0 x28: 0000000000000002 x27: ffffff80c1be8250 00636 x26: ffffff80dd9b0000 x25: 0000000000000020 x24: 000000000000002d 00636 x23: 000000000000003c x22: ffffffc080a54518 x21: ffffff80da6e00d0 00636 x20: ffffff80da6e0170 x19: ffffff80c1a1d240 x18: 00000000ffffffff 00636 x17: 3535303937202d3c x16: 203139202d3c2035 x15: 00000000ffffffff 00636 x14: 0000000000000000 x13: ffffff80d71b63f1 x12: 0000000000000006 00636 x11: ffffffc080beb1c0 x10: 0000000000000020 x9 : 00000000000134cc 00636 x8 : 0000000000000020 x7 : 0000000000000004 x6 : 0000000000000020 00636 x5 : ffffff80d71b63f7 x4 : ffffffc080a5451b x3 : 0000000000000000 00636 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 00636 Call trace: 00636 print_chain+0xb8/0x170 (P) 00636 bch2_check_for_deadlock+0x444/0x5a0 00636 bch2_btree_deadlock_read+0xb4/0x1c8 00636 full_proxy_read+0x74/0xd8 00636 vfs_read+0x90/0x300 Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.c 
b/fs/bcachefs/btree_locking.c index b18fbf6f6226..94eb2b73a843 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = i->trans->locking_wait.task; + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", task ?task->pid : 0); + prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } From 2adfa467347f6e5d8091ecbc45a78cac3d2a2b91 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Mar 2025 16:26:32 -0400 Subject: [PATCH 179/180] bcachefs: btree node write errors now print btree node It turned out a user was wondering why we were going read-only after a write error, and he didn't realize he didn't have replication enabled - this will make that more obvious, and we should be printing it anyways. 
Link: https://www.reddit.com/r/bcachefs/comments/1jf9akl/large_data_transfers_switched_bcachefs_to_readonly/ Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6abc9f17ea3c..2ba33ffc9795 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2117,8 +2117,14 @@ static void btree_node_write_work(struct work_struct *work) return; err: set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); + + if (!bch2_err_matches(ret, EROFS)) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); + bch2_btree_pos_to_text(&buf, c, b); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + } goto out; } @@ -2135,10 +2141,14 @@ static void btree_node_write_endio(struct bio *bio) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, wbio->submit_time, !bio->bi_status); - if (ca && bio->bi_status) - bch_err_dev_ratelimited(ca, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)); + if (ca && bio->bi_status) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "btree write error: %s\n ", + bch2_blk_status_to_str(bio->bi_status)); + bch2_btree_pos_to_text(&buf, c, b); + bch_err_dev_ratelimited(ca, "%s", buf.buf); + printbuf_exit(&buf); + } if (bio->bi_status) { unsigned long flags; From d8bdc8daac1d1b0a4efb1ecc69bef4eb4fc5e050 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 21:16:50 -0400 Subject: [PATCH 180/180] bcachefs: Kill unnecessary bch2_dev_usage_read() bch2_dev_usage_read() is fairly expensive, we should optimize this more. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 95aafc232290..0cac65347a5d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -631,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); + u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48;