From 47fe65b105f29ef22f463c17ab8a4c0eeb741045 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Jun 2025 20:22:17 -0400 Subject: [PATCH 01/23] bcachefs: Add missing restart handling to check_topology() The next patch will add logging of the specific error being corrected in repair paths to the journal; this means __bch2_fsck_err() can return transaction restarts in places that previously weren't expecting them. check_topology() is old code that doesn't use btree iterators for btree node locking - it'll have to be rewritten in the future to work online. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 95 ++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9ddcbe1bda78..e92cf3928c63 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -397,7 +397,11 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct continue; } - ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); + ret = lockrestart_do(trans, + btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); + if (ret < 0) + goto err; + if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -438,7 +442,8 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); + ret = lockrestart_do(trans, + btree_repair_node_end(trans, b, prev, pulled_from_scan)); if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -519,6 +524,46 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct bch2_bkey_buf_exit(&prev_k, c); bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +static int bch2_check_root(struct btree_trans *trans, enum btree_id i, + bool *reconstructed_root) +{ + struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, i); + struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_btree_id_to_text(&buf, i); + + if (r->error) { + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); + + r->alive = false; + r->error = 0; + + if (!bch2_btree_has_scanned_nodes(c, i)) { + __fsck_err(trans, + FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), + btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", buf.buf); + bch2_btree_root_alloc_fake_trans(trans, i, 0); + } else { + bch2_btree_root_alloc_fake_trans(trans, i, 1); + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + if (ret) + goto err; + } + + *reconstructed_root = true; + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); return ret; } @@ -526,42 +571,18 @@ int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bpos pulled_from_scan = POS_MIN; - struct printbuf buf = PRINTBUF; int ret = 0; bch2_trans_srcu_unlock(trans); for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; +recover: + ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root)); + if (ret) + break; - printbuf_reset(&buf); - bch2_btree_id_to_text(&buf, i); - - if (r->error) { -reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - - r->alive = false; - r->error = 0; - - if (!bch2_btree_has_scanned_nodes(c, i)) { - __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), - btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); - if (ret) - break; - } - - reconstructed_root = true; - } - + struct btree_root *r = bch2_btree_id_root(c, i); struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); @@ -575,17 +596,21 @@ int bch2_check_topology(struct bch_fs *c) r->b = NULL; - if (!reconstructed_root) - goto reconstruct_root; + if (!reconstructed_root) { + r->error = -EIO; + goto recover; + } + struct printbuf buf = PRINTBUF; + bch2_btree_id_to_text(&buf, i); bch_err(c, "empty btree root %s", buf.buf); + printbuf_exit(&buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } } -fsck_err: - printbuf_exit(&buf); + bch2_trans_put(trans); return ret; } From b43f724927686387589b98eb762e1a4109bb1bac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Jun 2025 13:07:31 -0400 Subject: [PATCH 02/23] bcachefs: Log fsck errors in the journal Log the specific error being corrected in the journal when we're repairing, this helps greatly with 'bcachefs list_journal' analysis. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 63951e293c47..ff49cebfd0e8 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -620,6 +620,9 @@ int __bch2_fsck_err(struct bch_fs *c, if (s) s->ret = ret; + + if (trans) + ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; err_unlock: mutex_unlock(&c->fsck_error_msgs_lock); err: From c7e351be7aa4e176223f0128e4d1b959ffad9598 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Jun 2025 19:00:11 -0400 Subject: [PATCH 03/23] bcachefs: Add range being updated to btree_update_to_text() We had a deadlock during recovery where interior btree updates became wedged and all open_buckets were consumed; start adding more introspection. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 26 ++++++++++++++++++++++++-- fs/bcachefs/btree_update_interior.h | 7 +++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d2ecb782919b..340812b28d08 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1232,6 +1232,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); + struct btree *b = btree_path_node(path, path->level); + as->node_start = b->data->min_key; + as->node_end = b->data->max_key; + as->node_needed_rewrite = btree_node_need_rewrite(b); + as->node_written = b->written; + as->node_sectors = btree_buf_bytes(b) >> 9; + as->node_remaining = __bch2_btree_u64s_remaining(b, + btree_bkey_last(b, bset_tree_last(b))); + /* * We don't want to allocate if we're in an error state, that can cause * deadlock on emergency shutdown due to open buckets getting stuck in @@ -2108,6 +2117,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; + as->node_start = prev->data->min_key; + as->node_end = next->data->max_key; + trace_and_count(c, btree_node_merge, trans, b); n = bch2_btree_node_alloc(as, trans, b->c.level); @@ -2681,9 +2693,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_str(out, " "); bch2_btree_id_to_text(out, as->btree_id); - prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, " l=%u-%u ", as->update_level_start, - as->update_level_end, + as->update_level_end); + bch2_bpos_to_text(out, as->node_start); + prt_char(out, ' '); + bch2_bpos_to_text(out, as->node_end); + prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %u", + as->node_written, + as->node_sectors, + as->node_remaining, + as->node_needed_rewrite); + + prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7fe793788a79..85682a13de4d 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,6 +57,13 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; + struct bpos node_start; + struct bpos node_end; + bool node_needed_rewrite; + u16 node_written; + u16 node_sectors; + u16 node_remaining; + unsigned update_level_start; unsigned update_level_end; From b76cce12700b8bb8d59b9ff444504b94db96dd4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Jun 2025 20:53:01 -0400 Subject: [PATCH 04/23] bcachefs: Add more flags to btree nodes for rewrite reason It seems excessive forced btree node rewrites can cause interior btree updates to become wedged during recovery, before we're using the write buffer for backpointer updates. Add more flags so we can determine where these are coming from. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 10 ++++++++-- fs/bcachefs/btree_types.h | 29 +++++++++++++++++++++++++++++ fs/bcachefs/btree_update_interior.c | 13 ++++++++++--- fs/bcachefs/btree_update_interior.h | 2 +- 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 57eff3012a7b..6787d5b91eab 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1045,6 +1045,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, le16_add_cpu(&i->u64s, -next_good_key); memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); } fsck_err: printbuf_exit(&buf); @@ -1305,6 +1306,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); continue; } if (ret) @@ -1329,12 +1331,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } } - if (!ptr_written) + if (!ptr_written) { set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_ptr_written_zero(b); + } fsck_err: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c61c4171ae50..3aa4a602bd02 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -617,6 +617,9 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ + x(need_rewrite_error) \ + x(need_rewrite_degraded) \ + x(need_rewrite_ptr_written_zero) \ x(never_write) \ x(pinned) @@ -641,6 +644,32 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \ BTREE_FLAGS() #undef x +#define BTREE_NODE_REWRITE_REASON() \ + x(none) \ + x(unknown) \ + x(error) \ + x(degraded) \ + x(ptr_written_zero) + +enum btree_node_rewrite_reason { +#define x(n) BTREE_NODE_REWRITE_##n, + BTREE_NODE_REWRITE_REASON() +#undef x +}; + +static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b) +{ + if (btree_node_need_rewrite_ptr_written_zero(b)) + return BTREE_NODE_REWRITE_ptr_written_zero; + if (btree_node_need_rewrite_degraded(b)) + return BTREE_NODE_REWRITE_degraded; + if (btree_node_need_rewrite_error(b)) + return BTREE_NODE_REWRITE_error; + if (btree_node_need_rewrite(b)) + return BTREE_NODE_REWRITE_unknown; + return BTREE_NODE_REWRITE_none; +} + static inline struct btree_write *btree_current_write(struct btree *b) { return b->writes + btree_node_write_idx(b); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 340812b28d08..e77584607f0d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1138,6 +1138,13 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * start_time); } +static const char * const btree_node_reawrite_reason_strs[] = { +#define x(n) #n, + BTREE_NODE_REWRITE_REASON() +#undef x + NULL, +}; + static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned level_start, bool split, @@ -1235,7 +1242,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, struct btree *b = btree_path_node(path, path->level); as->node_start = b->data->min_key; as->node_end = b->data->max_key; - as->node_needed_rewrite = btree_node_need_rewrite(b); + as->node_needed_rewrite = btree_node_rewrite_reason(b); as->node_written = b->written; as->node_sectors = btree_buf_bytes(b) >> 9; as->node_remaining = __bch2_btree_u64s_remaining(b, @@ -2699,11 +2706,11 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update bch2_bpos_to_text(out, as->node_start); prt_char(out, ' '); bch2_bpos_to_text(out, as->node_end); - prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %u", + prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", as->node_written, as->node_sectors, as->node_remaining, - as->node_needed_rewrite); + btree_node_reawrite_reason_strs[as->node_needed_rewrite]); prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_modes[as->mode], diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 85682a13de4d..b649c36c3fbb 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -59,7 +59,7 @@ struct btree_update { enum btree_id btree_id; struct bpos node_start; struct bpos node_end; - bool node_needed_rewrite; + enum btree_node_rewrite_reason node_needed_rewrite; u16 node_written; u16 node_sectors; u16 node_remaining; From af5b88618a38371545251ad92ce42bc4bfa1142a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 12:01:22 -0400 Subject: [PATCH 05/23] bcachefs: Update /dev/disk/by-uuid on device add Invalidate pagecache after we write the new superblock and send a uevent. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 397a69da5a75..8648f22411be 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1995,6 +1995,22 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_late; } + /* + * We just changed the superblock UUID, invalidate cache and send a + * uevent to update /dev/disk/by-uuid + */ + invalidate_bdev(ca->disk_sb.bdev); + + char uuid_str[37]; + snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); + + char *envp[] = { + "CHANGE=uuid", + uuid_str, + NULL, + }; + kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); + up_write(&c->state_lock); out: printbuf_exit(&label); From 7b0e6b198e4ef39ec89285d953778c5a3a04d390 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 12:56:15 -0400 Subject: [PATCH 06/23] bcachefs: Mark need_discard_freespace_key_bad autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 6fdbf265e4c0..90c878f289b2 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -134,7 +134,7 @@ enum bch_fsck_flags { x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ - x(need_discard_freespace_key_bad, 124, 0) \ + x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \ x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ From b47a82ff4772ea9d7091b85ef5f34dc78c866a02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 12:56:33 -0400 Subject: [PATCH 07/23] bcachefs: Only run 'increase_depth' for keys from btree node csan bch2_btree_increase_depth() was originally for disaster recovery, to get some data back from the journal when a btree root was bad. We don't need it for that purpose anymore; on bad btree root we'll launch btree node scan and reconstruct all the interior nodes. If there's a key in the journal for a depth that doesn't exists, and it's not from check_topology/btree node scan, we should just ignore it. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1e68e61f08e8..dbd066260725 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -271,13 +271,24 @@ static int bch2_journal_replay_key(struct btree_trans *trans, goto out; struct btree_path *path = btree_iter_path(trans, &iter); - if (unlikely(!btree_path_node(path, k->level))) { + if (unlikely(!btree_path_node(path, k->level) && + !k->allocated)) { + struct bch_fs *c = trans->c; + + if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| + BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { + bch_err(c, "have key in journal replay for btree depth that does not exist, confused"); + ret = -EINVAL; + } +#if 0 bch2_trans_iter_exit(trans, &iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: -BCH_ERR_transaction_restart_nested; +#endif + k->overwritten = true; goto out; } From dd22844f48a71a414bd7e08cd4a9a5847974c07e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 14:22:24 -0400 Subject: [PATCH 08/23] bcachefs: Read error message now prints if self healing Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 11 +++++++++-- fs/bcachefs/io_read.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index a77779afad01..04bbdcf58e40 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -343,6 +343,10 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, *bounce = true; *read_full = promote_full; + + if (have_io_error(failed)) + orig->self_healing = true; + return promote; nopromote: trace_io_read_nopromote(c, ret); @@ -635,12 +639,15 @@ static void bch2_rbio_retry(struct work_struct *work) prt_str(&buf, "(internal move) "); prt_str(&buf, "data read error, "); - if (!ret) + if (!ret) { prt_str(&buf, "successful retry"); - else + if (rbio->self_healing) + prt_str(&buf, ", self healing"); + } else prt_str(&buf, bch2_err_str(ret)); prt_newline(&buf); + if (!bkey_deleted(&sk.k->k)) { bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); prt_newline(&buf); diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 45c959018919..9c5ddbf861b3 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -44,6 +44,7 @@ struct bch_read_bio { have_ioref:1, narrow_crcs:1, saw_error:1, + self_healing:1, context:2; }; u16 _state; From 263561649ee5875a922f4358e96d3deb17a91021 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 14:27:35 -0400 Subject: [PATCH 09/23] bcachefs: Don't persistently run scan_for_btree_nodes bch2_btree_lost_data() gets called on btree node read error, but the error might be transient. btree_node_scan is expensive, and there's no need to run it persistently (marking it in the superblock as required to run) - check_topology will run it if required, via bch2_get_scanned_nodes(). Running it non-persistently is fine, to avoid check_topology having to rewind recovery to run it. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 ++ fs/bcachefs/recovery_passes.c | 14 +++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index dbd066260725..06d0ba0fbffb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -99,9 +99,11 @@ int bch2_btree_lost_data(struct bch_fs *c, goto out; case BTREE_ID_snapshots: ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; default: + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 605588e33fb3..35ac0d64d73a 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -294,8 +294,13 @@ static bool recovery_pass_needs_set(struct bch_fs *c, enum bch_run_recovery_pass_flags *flags) { struct bch_fs_recovery *r = &c->recovery; - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + + /* + * Never run scan_for_btree_nodes persistently: check_topology will run + * it if required + */ + if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + *flags |= RUN_RECOVERY_PASS_nopersistent; if ((*flags & RUN_RECOVERY_PASS_ratelimit) && !bch2_recovery_pass_want_ratelimit(c, pass)) @@ -310,6 +315,8 @@ static bool recovery_pass_needs_set(struct bch_fs *c, * Otherwise, we run run_explicit_recovery_pass when we find damage, so * it should run again even if it's already run: */ + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); if (persistent ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) @@ -334,6 +341,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, struct bch_fs_recovery *r = &c->recovery; int ret = 0; + lockdep_assert_held(&c->sb_lock); bch2_printbuf_make_room(out, 1024); @@ -446,7 +454,7 @@ int bch2_require_recovery_pass(struct bch_fs *c, int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; + enum bch_run_recovery_pass_flags flags = 0; if (!recovery_pass_needs_set(c, pass, &flags)) return 0; From 3315113af178041ab474723804e1322cee7d0a97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 18:56:28 -0400 Subject: [PATCH 10/23] bcachefs: mark more errors autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 90c878f289b2..d06e73884871 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -165,7 +165,7 @@ enum bch_fsck_flags { x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ x(ptr_to_missing_stripe, 150, 0) \ x(ptr_to_incorrect_stripe, 151, 0) \ - x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \ x(ptr_too_stale, 153, 0) \ x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ x(ptr_bucket_data_type_mismatch, 155, 0) \ @@ -236,7 +236,7 @@ enum bch_fsck_flags { x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ - x(inode_has_child_snapshots_wrong, 287, 0) \ + x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ @@ -279,8 +279,8 @@ enum bch_fsck_flags { x(root_dir_missing, 239, 0) \ x(root_inode_not_dir, 240, 0) \ x(dir_loop, 241, 0) \ - x(hash_table_key_duplicate, 242, 0) \ - x(hash_table_key_wrong_offset, 243, 0) \ + x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \ + x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ x(reflink_p_front_pad_bad, 245, 0) \ x(journal_entry_dup_same_device, 246, 0) \ From 0acb385ec19c99b213d28a309a941790758c53a8 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 20 May 2025 15:34:28 +0800 Subject: [PATCH 11/23] bcachefs: Fix possible console lock involved deadlock Link: https://lore.kernel.org/all/6822ab02.050a0220.f2294.00cb.GAE@google.com/T/ Reported-by: syzbot+2c3ef91c9523c3d1a25c@syzkaller.appspotmail.com Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/error.c | 2 +- fs/bcachefs/super.c | 11 +++-------- fs/bcachefs/util.c | 10 ++-------- fs/bcachefs/util.h | 2 +- 6 files changed, 8 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3651a296d506..5a1cede2febf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -296,7 +296,6 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") void bch2_print_str(struct bch_fs *, const char *, const char *); -void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 47035aae232e..91a51aef82f1 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -213,7 +213,7 @@ static noinline __noreturn void break_cycle_fail(struct lock_graph *g) prt_newline(&buf); } - bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); + bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index ff49cebfd0e8..a8ec6aae5738 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -69,7 +69,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_str_nonblocking(c, KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8648f22411be..ae8fa58a208d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -104,7 +104,7 @@ const char * const bch2_dev_write_refs[] = { #undef x static void __bch2_print_str(struct bch_fs *c, const char *prefix, - const char *str, bool nonblocking) + const char *str) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -114,17 +114,12 @@ static void __bch2_print_str(struct bch_fs *c, const char *prefix, return; } #endif - bch2_print_string_as_lines(KERN_ERR, str, nonblocking); + bch2_print_string_as_lines(KERN_ERR, str); } void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) { - __bch2_print_str(c, prefix, str, false); -} - -void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str) -{ - __bch2_print_str(c, prefix, str, true); + __bch2_print_str(c, prefix, str); } __printf(2, 0) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index dc3817f545fa..df9a6071fe18 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -262,8 +262,7 @@ static bool string_is_spaces(const char *str) return true; } -void bch2_print_string_as_lines(const char *prefix, const char *lines, - bool nonblocking) +void bch2_print_string_as_lines(const char *prefix, const char *lines) { bool locked = false; const char *p; @@ -273,12 +272,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines, return; } - if (!nonblocking) { - console_lock(); - locked = true; - } else { - locked = console_trylock(); - } + locked = console_trylock(); while (*lines) { p = strchrnul(lines, '\n'); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 0a4b1d433621..6488f098d140 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -214,7 +214,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); -void bch2_print_string_as_lines(const char *, const char *, bool); +void bch2_print_string_as_lines(const char *, const char *); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); From f946ce0be45e75801583a5371fccf7466961d9d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 20:18:16 -0400 Subject: [PATCH 12/23] bcachefs: Make sure opts.read_only gets propagated back to VFS If we think we're read-only but the VFS doesn't, fun will ensue. And now that we know we have to be able to do this safely, just make nochanges imply ro. Reported-by: syzbot+a7d6ceaba099cc21dee4@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 ++++++++ fs/bcachefs/recovery.c | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 85d13f800165..3063a8ddc2df 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2490,6 +2490,14 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (ret) goto err_stop_fs; + /* + * We might be doing a RO mount because other options required it, or we + * have no alloc info and it's a small image with no room to regenerate + * it + */ + if (c->opts.read_only) + fc->sb_flags |= SB_RDONLY; + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); if (ret) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 06d0ba0fbffb..520eb72c4971 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -752,9 +752,11 @@ int bch2_fs_recovery(struct bch_fs *c) ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) : BCH_RECOVERY_PASS_snapshots_read; c->opts.nochanges = true; - c->opts.read_only = true; } + if (c->opts.nochanges) + c->opts.read_only = true; + mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; From 757601ef853359fe2d57d75c00b5045f62efc608 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 11:40:00 -0400 Subject: [PATCH 13/23] bcachefs: Don't put rhashtable on stack Object debugging generally needs special provisions for putting said objects on the stack, which rhashtable does not have. Reported-by: syzbot+bcc38a9556d0324c2ec2@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6d7b1d5f7697..27e68d470ad0 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -28,7 +28,7 @@ #include struct buckets_in_flight { - struct rhashtable table; + struct rhashtable *table; struct move_bucket *first; struct move_bucket *last; size_t nr; @@ -98,7 +98,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, static void move_bucket_free(struct buckets_in_flight *list, struct move_bucket *b) { - int ret = rhashtable_remove_fast(&list->table, &b->hash, + int ret = rhashtable_remove_fast(list->table, &b->hash, bch_move_bucket_params); BUG_ON(ret); kfree(b); @@ -133,7 +133,7 @@ static void move_buckets_wait(struct moving_context *ctxt, static bool bucket_in_flight(struct buckets_in_flight *list, struct move_bucket_key k) { - return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); + return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); } static int bch2_copygc_get_buckets(struct moving_context *ctxt, @@ -185,7 +185,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, goto err; } - ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash, + ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, bch_move_bucket_params); BUG_ON(ret2); @@ -350,10 +350,13 @@ static int bch2_copygc_thread(void *arg) struct buckets_in_flight buckets = {}; u64 last, wait; - int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); + buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); + int ret = !buckets.table + ? -ENOMEM + : rhashtable_init(buckets.table, &bch_move_bucket_params); bch_err_msg(c, ret, "allocating copygc buckets in flight"); if (ret) - return ret; + goto err; set_freezable(); @@ -421,11 +424,12 @@ static int bch2_copygc_thread(void *arg) } move_buckets_wait(&ctxt, &buckets, true); - rhashtable_destroy(&buckets.table); + rhashtable_destroy(buckets.table); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); - - return 0; +err: + kfree(buckets.table); + return ret; } void bch2_copygc_stop(struct bch_fs *c) From 082c74411491f8b0d31465fc104b8342e66c4056 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 11:58:59 -0400 Subject: [PATCH 14/23] bcachefs: Fix downgrade_table_extra() Fix a UAF: we were calling darray_make_room() and retaining a pointer to the old buffer. And fix an UBSAN warning: struct bch_sb_field_downgrade_entry uses __counted_by, so set dst->nr_errors before assigning to the array entry. Reported-by: syzbot+14c52d86ddbd89bea13e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index b61f88450a6d..1506d05e0665 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -253,6 +253,7 @@ DOWNGRADE_TABLE() static int downgrade_table_extra(struct bch_fs *c, darray_char *table) { + unsigned dst_offset = table->nr; struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); int ret = 0; @@ -268,6 +269,9 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) if (ret) return ret; + dst = (void *) &table->data[dst_offset]; + dst->nr_errors = cpu_to_le16(nr_errors + 1); + /* open coded __set_bit_le64, as dst is packed and * dst->recovery_passes is misaligned */ unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; @@ -278,7 +282,6 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) break; } - dst->nr_errors = cpu_to_le16(nr_errors); return ret; } From 54aacfe3976893ee04582a0bd61967bbee5a7e04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 12:17:02 -0400 Subject: [PATCH 15/23] bcachefs: Fix rcu_pending for PREEMPT_RT PREEMPT_RT redefines how standard spinlocks work, so local_irq_save() + spin_lock() is no longer equivalent to spin_lock_irqsave(). Fortunately, we don't strictly need to do it that way. Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index bef2aa1b8bcd..b1438be9d690 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -182,11 +182,6 @@ static inline void kfree_bulk(size_t nr, void ** p) while (nr--) kfree(*p); } - -#define local_irq_save(flags) \ -do { \ - flags = 0; \ -} while (0) #endif static noinline void __process_finished_items(struct rcu_pending *pending, @@ -429,9 +424,15 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + /* We could technically be scheduled before taking the lock and end up + * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock + * anyways + * + * And we have to do it this way to avoid breaking PREEMPT_RT, which + * redefines how spinlocks work: + */ + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && @@ -520,9 +521,8 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, goto free_node; } - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); goto restart; } From 9e48f574e55731106b26dc33d8b0be1adedf3f20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jun 2025 17:30:40 -0400 Subject: [PATCH 16/23] bcachefs: Fix leak in bch2_fs_recovery() error path Fix a small leak of the superblock 'clean' section. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 520eb72c4971..0b21fa6ff062 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1108,9 +1108,6 @@ int bch2_fs_recovery(struct bch_fs *c) out: bch2_flush_fsck_errs(c); - if (!IS_ERR(clean)) - kfree(clean); - if (!ret && test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && !c->opts.nochanges) { @@ -1119,6 +1116,9 @@ int bch2_fs_recovery(struct bch_fs *c) } bch_err_fn(c, ret); +final_out: + if (!IS_ERR(clean)) + kfree(clean); return ret; err: fsck_err: @@ -1132,7 +1132,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } - return ret; + goto final_out; } int bch2_fs_initialize(struct bch_fs *c) From c3dd25319c1818e067f41f41127f935f153498e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jun 2025 17:28:00 -0400 Subject: [PATCH 17/23] bcachefs: Don't pass trans to fsck_err() in gc_accounting_done fsck_err() can return a transaction restart if passed a transaction object - this has always been true when it has to drop locks to prompt for user input, but we're seeing this more now that we're logging the error being corrected in the journal. gc_accounting_done() doesn't call fsck_err() from an actual commit loop, and it doesn't need to be holding btree locks when it calls fsck_err(), so the easy fix here for the unhandled transaction restart is to just not pass it the transaction object. We'll miss out on the fancy new logging, but that's ok. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 3d59a57a5256..f7528cd69c73 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -618,7 +618,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) for (unsigned j = 0; j < nr; j++) src_v[j] -= dst_v[j]; - if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + bch2_trans_unlock_long(trans); + + if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { percpu_up_write(&c->mark_lock); ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); From e82b3a63a9a91cc5c585efb3e28f32100f24e3e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jun 2025 11:24:04 +0200 Subject: [PATCH 18/23] bcachefs: ioctl: avoid stack overflow warning Multiple ioctl handlers individually use a lot of stack space, and clang chooses to inline them into the bch2_fs_ioctl() function, blowing through the warning limit: fs/bcachefs/chardev.c:655:6: error: stack frame size (1032) exceeds limit (1024) in 'bch2_fs_ioctl' [-Werror,-Wframe-larger-than] 655 | long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) By marking the largest two of them as noinline_for_stack, no indidual code path ends up using this much, which avoids the warning and reduces the possible total stack usage in the ioctl handler. Signed-off-by: Arnd Bergmann Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 2d38466eddfd..fde3c2380e28 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -399,7 +399,7 @@ static long bch2_ioctl_data(struct bch_fs *c, return ret; } -static long bch2_ioctl_fs_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { struct bch_ioctl_fs_usage arg = {}; @@ -469,7 +469,7 @@ static long bch2_ioctl_query_accounting(struct bch_fs *c, } /* obsolete, didn't allow for new data types: */ -static long bch2_ioctl_dev_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { struct bch_ioctl_dev_usage arg; From 625c494db95624f09f5e7b81b64d4da34e45bd2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Jun 2025 13:32:58 -0400 Subject: [PATCH 19/23] bcachefs: Fix version checks in validate_bset() It seems btree node scan picked up a partially overwritten btree node, and corrected the "bset version older than sb version_min" error - resulting in an invalid superblock with a bad version_min field. Don't run this check at all when we're in btree node scan, and when we do run it, do something saner if the bset version is totally crazy. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6787d5b91eab..d8f3c4c65e90 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -741,16 +741,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); - if (btree_err_on(version < c->sb.version_min, + if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes && + btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + if (bch2_version_compatible(version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } else { + /* We have no idea what's going on: */ + i->version = cpu_to_le16(c->sb.version); + } } if (btree_err_on(BCH_VERSION_MAJOR(version) > From 205da7c026739d965e605d39001049c7b6728e87 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 11:31:23 -0400 Subject: [PATCH 20/23] bcachefs: Don't trust sb->nr_devices in members_to_text() We have to be able to print superblock sections even if they fail to validate (for debugging), so we have to calculate the number of entries from the field size. Reported-by: syzbot+5138f00559ffb3cb3610@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-members.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 363eb0c6eb7c..6245e342a8a8 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -325,9 +325,17 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v1_get(mi, i), gi, sb, i); } @@ -341,9 +349,27 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + if (!le16_to_cpu(mi->member_bytes)) { + prt_printf(out, "member_bytes 0"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + /* + * We call to_text() on superblock sections that haven't passed + * validate, so we can't trust sb->nr_devices. + */ + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v2_get(mi, i), gi, sb, i); } From b68baf9a87330148d3ad074e48503a4e9c5c3929 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Jun 2025 15:57:48 -0400 Subject: [PATCH 21/23] bcachefs: Print devices we're mounting on multi device filesystems Previously, we only ever logged the filesystem UUID. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ae8fa58a208d..a5b97c9c5163 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1067,12 +1067,13 @@ noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; - struct printbuf p = PRINTBUF; - bool first = true; + CLASS(printbuf, p)(); + bch2_log_msg_start(c, &p); prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); + bool first = true; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -1089,17 +1090,24 @@ static void print_mount_opts(struct bch_fs *c) } if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\n allowing incompatible features above "); + prt_printf(&p, "\nallowing incompatible features above "); bch2_version_to_text(&p, c->sb.version_incompat_allowed); } if (c->opts.verbose) { - prt_printf(&p, "\n features: "); + prt_printf(&p, "\nfeatures: "); prt_bitflags(&p, bch2_sb_features, c->sb.features); } - bch_info(c, "%s", p.buf); - printbuf_exit(&p); + if (c->sb.multi_device) { + prt_printf(&p, "\nwith devices"); + for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { + prt_char(&p, ' '); + prt_str(&p, ca->name); + } + } + + bch2_print_str(c, KERN_INFO, p.buf); } static bool bch2_fs_may_start(struct bch_fs *c) From cd1124244be30fd3e87da9186508aab371e9307d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 12:35:20 -0400 Subject: [PATCH 22/23] bcachefs: Ensure that snapshot creation propagates has_case_insensitive We normally can't create a new directory with the case-insensitive option already set - except when we're creating a snapshot. And if casefolding is enabled filesystem wide, we should still set it even though not strictly required, for consistency. Signed-off-by: Kent Overstreet --- fs/bcachefs/namei.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 24120037c031..779c22eb3979 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -175,6 +175,16 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } + if (S_ISDIR(mode)) { + ret = bch2_maybe_propagate_has_case_insensitive(trans, + (subvol_inum) { + new_inode->bi_subvol ?: dir.subvol, + new_inode->bi_inum }, + new_inode); + if (ret) + goto err; + } + if (S_ISDIR(mode) && !new_inode->bi_subvol) new_inode->bi_depth = dir_u->bi_depth + 1; From aef22f6fe7a630d536f9eaa0a7a2ed0f90ea369e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Jun 2025 22:32:14 -0400 Subject: [PATCH 23/23] bcachefs: Don't trace should_be_locked unless changing Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 9adca77e2580..f2173a3316f4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -417,8 +417,10 @@ static inline void btree_path_set_should_be_locked(struct btree_trans *trans, st EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(path->uptodate); - path->should_be_locked = true; - trace_btree_path_should_be_locked(trans, path); + if (!path->should_be_locked) { + path->should_be_locked = true; + trace_btree_path_should_be_locked(trans, path); + } } static inline void __btree_path_set_level_up(struct btree_trans *trans,