From 0e5f1f3f8fad0d195099e4a8e7c43ffe71676047 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Mar 2025 17:59:30 -0400 Subject: [PATCH 001/218] bcachefs: bch2_subvolume_wait_for_pagecache_and_delete() cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d0209f7658bb..239ea783698c 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -479,13 +479,11 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - snapshot_id_list s; - u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); - s = c->snapshots_unlinked; + snapshot_id_list s = c->snapshots_unlinked; darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); @@ -494,7 +492,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor bch2_evict_subvolume_inodes(c, &s); - for (id = s.data; id < s.data + s.nr; id++) { + darray_for_each(s, id) { ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); bch_err_msg(c, ret, "deleting subvolume %u", *id); if (ret) From 6659ba3b18f71282c7c54f3bffcfecfac73f202f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 13:33:41 -0400 Subject: [PATCH 002/218] bcachefs: Be precise about bch_io_failures If the extent we're reading from changes, due to being overwritten or moved (possibly partially) - we need to reset bch_io_failures so that we don't accidentally mark a new extent as poisoned prematurely. This means we have to separately track (in the retry path) the extent we previously read from. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 1 + fs/bcachefs/io_read.c | 51 ++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/io_read.h | 5 +++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 054e2d5e8448..082632905649 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && + l.k->size == r.k->size && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index def4a26a3b45..3705b606f675 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -296,6 +296,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, bool *read_full, struct bch_io_failures *failed) { + /* + * We're in the retry path, but we don't know what to repair yet, and we + * don't want to do a promote here: + */ + if (failed && !failed->nr) + return NULL; + struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're @@ -430,6 +437,28 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } +static void get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_buf *sk) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + rbio->data_btree, rbio->data_pos, 0))); + if (ret) + return; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { + bch2_bkey_buf_reassemble(sk, trans->c, k); + break; + } + + bch2_trans_iter_exit(trans, &iter); +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -491,11 +520,18 
@@ static void bch2_rbio_retry(struct work_struct *work) struct btree_trans *trans = bch2_trans_get(c); + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + bkey_init(&sk.k->k); + trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + get_rbio_extent(trans, rbio, &sk); + + if (!bkey_deleted(&sk.k->k) && + bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick, rbio->ret == -BCH_ERR_data_read_retry_csum_err); @@ -516,7 +552,7 @@ static void bch2_rbio_retry(struct work_struct *work) int ret = rbio->data_update ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, flags); + : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); if (ret) { rbio->ret = ret; @@ -539,6 +575,7 @@ static void bch2_rbio_retry(struct work_struct *work) } bch2_rbio_done(rbio); + bch2_bkey_buf_exit(&sk, c); bch2_trans_put(trans); } @@ -1265,7 +1302,9 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, + struct bkey_buf *prev_read, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -1313,6 +1352,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, k = bkey_i_to_s_c(sk.k); + if (unlikely(flags & BCH_READ_in_retry)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) + failed->nr = 0; + bch2_bkey_buf_copy(prev_read, c, sk.k); + } + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index c78025d863e0..1a85b092fd1d 100644 --- a/fs/bcachefs/io_read.h +++ 
b/fs/bcachefs/io_read.h @@ -144,7 +144,8 @@ static inline void bch2_read_extent(struct btree_trans *trans, } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); + subvol_inum, + struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) @@ -154,7 +155,7 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped)); From 760be1ad5e71b3a23644849cbf3c2245c4dc83f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Mar 2025 14:03:25 -0400 Subject: [PATCH 003/218] bcachefs: Poison extents that can't be read due to checksum errors Copygc needs to be able to move extents that have bitrotted. We don't want to delete them - in the future we'll have an API for "read me the data even if there's checksum errors", and in general we don't want to delete anything unless the user asks us to. That will require writing it with a new checksum, which means we can't forget that there was a checksum error so we return the correct error to userspace. Rebalance also wants to skip bad extents; we can now use the poison flag for that. This is currently disabled by default, as we want read fua support so that we can distinguish between transient and permanent errors from the device. 
It may be enabled with the module parameter: poison_extents_on_checksum_error Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 71 ++++++++++++++++++++++++++++++-- fs/bcachefs/sb-counters_format.h | 1 + fs/bcachefs/trace.h | 5 +++ 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3705b606f675..3f111f918345 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -34,6 +34,12 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif +static bool bch2_poison_extents_on_checksum_error; +module_param_named(poison_extents_on_checksum_error, + bch2_poison_extents_on_checksum_error, bool, 0644); +MODULE_PARM_DESC(poison_extents_on_checksum_error, + "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -459,6 +465,52 @@ static void get_rbio_extent(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); } +static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + enum btree_id btree, struct bkey_s_c read_k) +{ + if (!bch2_poison_extents_on_checksum_error) + return 0; + + struct bch_fs *c = trans->c; + + struct data_update *u = rbio_data_update(rbio); + if (u) + read_k = bkey_i_to_s_c(u->k.k); + + u64 flags = bch2_bkey_extent_flags(read_k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), + BTREE_ITER_intent); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_and_val_eq(k, read_k)) + goto out; + + struct bkey_i *new = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + ret = PTR_ERR_OR_ZERO(new) ?: + (bkey_reassemble(new, k), 0) ?: + bch2_bkey_extent_flags_set(c, new, 
flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + + /* + * Propagate key change back to data update path, in particular so it + * knows the extent has been poisoned and it's safe to change the + * checksum + */ + if (u && !ret) + bch2_bkey_buf_copy(&u->k, c, new); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -492,7 +544,8 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) { @@ -1008,6 +1061,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto hole; if (unlikely(ret < 0)) { + if (ret == -BCH_ERR_data_read_csum_err) { + int ret2 = maybe_poison_extent(trans, orig, data_btree, k); + if (ret2) { + ret = ret2; + goto err; + } + + trace_and_count(c, io_read_fail_and_poison, &orig->bio); + } + struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "%s\n ", bch2_err_str(ret)); @@ -1310,6 +1373,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + enum btree_id data_btree; int ret; EBUG_ON(rbio->data_update); @@ -1320,7 +1384,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - enum btree_id data_btree = BTREE_ID_extents; + data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1392,8 +1456,6 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, break; } - bch2_trans_iter_exit(trans, &iter); - if (unlikely(ret)) { if (ret != 
-BCH_ERR_extent_poisoned) { struct printbuf buf = PRINTBUF; @@ -1412,6 +1474,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, bch2_rbio_done(rbio); } + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); return ret; } diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fa27ec59a647..5c4e5de79d81 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -16,6 +16,7 @@ enum counters_flags { x(io_read_split, 33, TYPE_COUNTER) \ x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 519d00d62ae7..8c07189a080a 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -339,6 +339,11 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +DEFINE_EVENT(bio, io_read_fail_and_poison, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /* ec.c */ TRACE_EVENT(stripe_create, From cb8336ca42e493a76b3cc05b76a51a2eed26cdaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 10:02:40 -0400 Subject: [PATCH 004/218] bcachefs: Data move can read from poisoned extents Now, if an extent is poisoned we can move it even if there was a checksum error. We'll have to give it a new checksum, but the poison bit means that userspace will still see the appropriate error when they try to read it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 6 +----- fs/bcachefs/io_read.c | 4 ++++ fs/bcachefs/move.c | 26 ++++++++++++++++++++------ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e597fb9c9823..a369f978ffe6 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -136,12 +136,8 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return -BCH_ERR_extent_poisoned; - rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 pick_latency; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3f111f918345..751a9679d7e5 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1053,6 +1053,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bvec_iter_sectors(iter)); goto out_read_done; } + + if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && + !orig->data_update) + return -BCH_ERR_extent_poisoned; retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dfdbb9259985..fe2fa665150b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -126,26 +126,40 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { + struct bch_fs *c = io->write.op.c; struct moving_context *ctxt = io->write.ctxt; + struct bch_read_bio *rbio = &io->write.rbio; if (ctxt->stats) { - if (io->write.rbio.bio.bi_status) + if (rbio->bio.bi_status) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_uncorrected); - else if (io->write.rbio.saw_error) + else if (rbio->saw_error) 
atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_corrected); } - if (unlikely(io->write.rbio.ret || - io->write.rbio.bio.bi_status || - io->write.data_opts.scrub)) { + /* + * If the extent has been bitrotted, we're going to have to give it a + * new checksum in order to move it - but the poison bit will ensure + * that userspace still gets the appropriate error. + */ + if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && + (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + + rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, + nonce, &rbio->bio); + rbio->ret = 0; + } + + if (unlikely(rbio->ret || io->write.data_opts.scrub)) { move_free(io); return; } if (trace_io_move_write_enabled()) { - struct bch_fs *c = io->write.op.c; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); From 8c087d2ddf5d4d8c07bec96531a5f5629066cd00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Mar 2025 09:46:06 -0400 Subject: [PATCH 005/218] bcachefs: Rebalance now skips poisoned extents Let's not move poisoned extents unnecessarily, since we can't guard against introducing more bitrot. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 623273556aa9..3c45500c1a28 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -95,6 +95,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | bch2_bkey_ptrs_need_move(c, opts, ptrs); } @@ -107,6 +110,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) if (!opts) return 0; + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; From 4e2caf82ce958d1fae96a6d6ba23ea9e80c597b4 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Thu, 27 Mar 2025 14:50:05 +0000 Subject: [PATCH 006/218] bcachefs: replace strncpy() with memcpy_and_pad in journal_transaction_name Strncpy is now deprecated. The buffer destination is not required to be NULL-terminated, but we also want to zero out the rest of the buffer as it is already done in other places. Link: https://github.com/KSPP/linux/issues/90 Signed-off-by: Roxana Nicolescu Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 7d7e52ddde02..4297d8b5eddd 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -20,6 +20,7 @@ #include "snapshot.h" #include +#include static const char * const trans_commit_flags_strs[] = { #define x(n, ...) 
#n, @@ -366,7 +367,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans) struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); + memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), + trans->fn, strlen(trans->fn), 0); } static inline int btree_key_can_insert(struct btree_trans *trans, From caa6baa45f809bd932362030b16a8bb3e1dae083 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Thu, 27 Mar 2025 14:50:09 +0000 Subject: [PATCH 007/218] bcachefs: replace memcpy with memcpy_and_pad for jset_entry_log->d buff This was achieved before by zero-ing out the source buffer and then copying the bytes into the destination buffer. This can also be done with memcpy_and_pad which will zero out only the destination buffer if its size is bigger than the size of the source buffer. This is already used in the same way in journal_transaction_name(). Moreover, zero-ing the source buffer was done twice, first in __bch2_fs_log_msg() and then in bch2_trans_log_msg(). And this method may also require allocating some extra memory for the source buffer. In conclusion, using memcpy_and_pad is better even though the result is the same because it brings uniformity with what's already used in journal_transaction_name, it avoids code duplication and reallocating extra memory. 
Signed-off-by: Roxana Nicolescu Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 1e6b7836cc01..2bffd5121c31 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -14,6 +14,8 @@ #include "snapshot.h" #include "trace.h" +#include + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -829,7 +831,6 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) { unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); - prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) @@ -842,7 +843,7 @@ int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf->buf, buf->pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf->buf, buf->pos, 0); return 0; } @@ -868,7 +869,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, prt_vprintf(&buf, fmt, args); unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); int ret = buf.allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) @@ -881,7 +881,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf.buf, buf.pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, From d02755b8c5f38e737037e0ff0820eb66ab63c4f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Apr 2025 14:29:31 -0400 Subject: [PATCH 008/218] bcachefs: trace bch2_trans_kmalloc() We're occasionally seeing the WARN_ON() for bump allocator usage exceeding BTREE_TRANS_MEM_MAX; add some tracing so we can see what's going on. Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 4 +++ fs/bcachefs/bcachefs.h | 3 ++ fs/bcachefs/btree_iter.c | 51 ++++++++++++++++++++++++++-- fs/bcachefs/btree_iter.h | 68 +++++++++++++++++++++++++++----------- fs/bcachefs/btree_types.h | 9 +++++ fs/bcachefs/btree_update.h | 4 +-- fs/bcachefs/debug.c | 6 ++++ 7 files changed, 121 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 07709b0d7688..a14e4a60b187 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -103,6 +103,10 @@ config BCACHEFS_PATH_TRACEPOINTS Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. 
+config BCACHEFS_TRANS_KMALLOC_TRACE + bool "Trace bch2_trans_kmalloc() calls" + depends on BCACHEFS_FS + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 75f7408da173..24eed2b3be4d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -650,6 +650,9 @@ struct btree_transaction_stats { unsigned nr_max_paths; unsigned journal_entries_size; unsigned max_mem; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif char *max_paths_text; }; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ac5f2046550d..cfd6363dfc39 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3095,7 +3095,19 @@ void bch2_trans_copy_iter(struct btree_trans *trans, dst->key_cache_path = 0; } -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, + darray_trans_kmalloc_trace *trace) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 60); + + darray_for_each(*trace, i) + prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); +} +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) { struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; @@ -3105,14 +3117,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + struct printbuf buf = PRINTBUF; + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +#endif + } ret = trans_maybe_inject_restart(trans, _RET_IP_); if (ret) return ERR_PTR(ret); struct 
btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); + if (new_bytes > s->max_mem) { + mutex_lock(&s->lock); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); + s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, + trans->trans_kmalloc_trace.nr); + + memcpy(s->trans_kmalloc_trace.data, + trans->trans_kmalloc_trace.data, + sizeof(s->trans_kmalloc_trace.data[0]) * + s->trans_kmalloc_trace.nr); +#endif + s->max_mem = new_bytes; + mutex_unlock(&s->lock); + } if (trans->used_mempool) { if (trans->mem_bytes >= new_bytes) @@ -3172,6 +3205,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: + bch2_trans_kmalloc_trace(trans, size, ip); + p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3285,6 +3320,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) } #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + trans->trans_kmalloc_trace.nr = 0; +#endif + trans_set_locked(trans, false); if (trans->restarted) { @@ -3454,6 +3493,9 @@ void bch2_trans_put(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&trans->trans_kmalloc_trace); +#endif unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; @@ -3608,6 +3650,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&s->trans_kmalloc_trace); +#endif kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9d2cccf5d21a..78a805a89860 100644 --- a/fs/bcachefs/btree_iter.h +++ 
b/fs/bcachefs/btree_iter.h @@ -543,18 +543,46 @@ void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btre void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); -void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *, + darray_trans_kmalloc_trace *); +#endif -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); + +static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_push(&trans->trans_kmalloc_trace, + ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); +#endif +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { size = roundup(size, 8); + bch2_trans_kmalloc_trace(trans, size, ip); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; + return p; + } else { + return __bch2_trans_kmalloc(trans, size, ip); + } +} + +static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, + unsigned long ip) +{ + size = roundup(size, 8); + + bch2_trans_kmalloc_trace(trans, size, ip); + if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; @@ -562,22 +590,24 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } -static inline 
void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +/** + * bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ +static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - size = round_up(size, 8); + return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); +} - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; - return p; - } else { - return __bch2_trans_kmalloc(trans, size); - } +static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); } static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 023c472dc9ee..81175c1344d2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -477,6 +477,12 @@ struct btree_trans_paths { struct btree_path paths[]; }; +struct trans_kmalloc_trace { + unsigned long ip; + size_t bytes; +}; +typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; + struct btree_trans { struct bch_fs *c; @@ -488,6 +494,9 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 568e56c91190..e674419c299e 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -222,7 +222,7 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->extra_disk_res = 0; } -static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, 
+static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, unsigned type, unsigned min_bytes) { unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); @@ -245,7 +245,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t return mut; } -static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) { return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 5a8bc7013512..2c52a2c6502b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -770,6 +770,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + printbuf_indent_add(&i->buf, 2); + bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); + printbuf_indent_sub(&i->buf, 2); +#endif + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); From ad63f9f1e9a10792847bc46f0226323f238a171d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 20:42:42 -0400 Subject: [PATCH 009/218] bcachefs: struct alloc_request Add a struct for common state for satisfying an on disk allocation, instead of passing the same long list of items to every function. This will help with stack usage, performance, and perhaps enable some code cleanups. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 292 ++++++++++++--------------------- fs/bcachefs/alloc_foreground.h | 25 ++- fs/bcachefs/ec.c | 46 +++--- fs/bcachefs/io_write.h | 28 ---- fs/bcachefs/io_write_types.h | 28 ++++ 5 files changed, 179 insertions(+), 240 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7ec022e9361a..93c91b5706fb 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -693,24 +693,20 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, } static int add_new_bucket(struct bch_fs *c, - struct open_buckets *ptrs, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - struct open_bucket *ob) + struct alloc_request *req, + struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += durability; - *have_cache |= !durability; + __clear_bit(ob->dev, req->devs_may_alloc.d); + req->nr_effective += durability; + req->have_cache |= !durability; - ob_push(c, ptrs, ob); + ob_push(c, &req->ptrs, ob); - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 1; if (ob->ec) return 1; @@ -718,36 +714,32 @@ static int add_new_bucket(struct bch_fs *c, } int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_write_flags flags, - enum bch_data_type data_type, - enum bch_watermark watermark, - struct closure *cl) + struct alloc_request *req, + struct dev_stripe_state *stripe, + enum bch_data_type data_type, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = -BCH_ERR_insufficient_devices; - BUG_ON(*nr_effective >= 
nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; - if (!ca->mi.durability && *have_cache) { + if (!ca->mi.durability && req->have_cache) { bch2_dev_put(ca); continue; } struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_alloc_nowait, &usage); + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, + req->watermark, data_type, + cl, req->flags & BCH_WRITE_alloc_nowait, + &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -759,9 +751,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob)) { + if (add_new_bucket(c, req, ob)) { ret = 0; break; } @@ -779,34 +769,27 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, */ static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = 0; - if (nr_replicas < 2) + if (req->nr_replicas < 2) return 0; - if (ec_open_bucket(c, ptrs)) + if (ec_open_bucket(c, &req->ptrs)) return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + bch2_ec_stripe_head_get(trans, req->target, 0, req->nr_replicas - 1, req->watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - 
struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) @@ -818,9 +801,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); goto out; } } @@ -832,65 +813,48 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, /* Sector allocator */ static bool want_bucket(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - bool *have_cache, bool ec, + struct alloc_request *req, struct open_bucket *ob) { struct bch_dev *ca = ob_dev(c, ob); - if (!test_bit(ob->dev, devs_may_alloc->d)) + if (!test_bit(ob->dev, req->devs_may_alloc.d)) return false; - if (ob->data_type != wp->data_type) + if (ob->data_type != req->wp->data_type) return false; if (!ca->mi.durability && - (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) return false; - if (ec != (ob->ec != NULL)) + if (req->ec != (ob->ec != NULL)) return false; return true; } static int bucket_alloc_set_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - bool ec) + struct alloc_request *req) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; int ret = 0; - open_bucket_for_each(c, &wp->ptrs, ob, i) { - if (!ret && want_bucket(c, wp, devs_may_alloc, - have_cache, ec, ob)) - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + open_bucket_for_each(c, 
&req->wp->ptrs, ob, i) { + if (!ret && want_bucket(c, req, ob)) + ret = add_new_bucket(c, req, ob); else ob_push(c, &ptrs_skip, ob); } - wp->ptrs = ptrs_skip; + req->wp->ptrs = ptrs_skip; return ret; } static int bucket_alloc_set_partial(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, bool ec, - enum bch_watermark watermark) + struct alloc_request *req) { int i, ret = 0; @@ -905,13 +869,13 @@ static int bucket_alloc_set_partial(struct bch_fs *c, for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + if (want_bucket(c, req, ob)) { struct bch_dev *ca = ob_dev(c, ob); struct bch_dev_usage usage; u64 avail; bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; + avail = dev_buckets_free(ca, usage, req->watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -924,9 +888,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; rcu_read_unlock(); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); if (ret) break; } @@ -937,61 +899,42 @@ static int bucket_alloc_set_partial(struct bch_fs *c, } static int __open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - bool erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *_cl) + struct alloc_request *req, + struct closure *_cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; unsigned i; int ret; - 
devs = target_rw_devs(c, wp->data_type, target); + req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*devs_have, i) - __clear_bit(*i, devs.d); + darray_for_each(*req->devs_have, i) + __clear_bit(*i, req->devs_may_alloc.d); - open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->dev, devs.d); + open_bucket_for_each(c, &req->ptrs, ob, i) + __clear_bit(ob->dev, req->devs_may_alloc.d); - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code); + ret = bucket_alloc_set_writepoint(c, req); if (ret) return ret; - ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code, watermark); + ret = bucket_alloc_set_partial(c, req); if (ret) return ret; - if (erasure_code) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, - nr_replicas, nr_effective, - have_cache, - watermark, flags, _cl); + if (req->ec) { + ret = bucket_alloc_from_stripe(trans, req, _cl); } else { retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, - nr_replicas, nr_effective, have_cache, - flags, wp->data_type, watermark, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, + req->wp->data_type, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1005,38 +948,27 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, } static int open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) 
+ struct alloc_request *req, + struct closure *cl) { int ret; - if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, erasure_code, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { + ret = __open_bucket_add_buckets(trans, req, cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_operation_blocked) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 0; } - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, false, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + bool ec = false; + swap(ec, req->ec); + ret = __open_bucket_add_buckets(trans, req, cl); + swap(ec, req->ec); + return ret < 0 ? ret : 0; } @@ -1327,51 +1259,49 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct write_point **wp_ret) { struct bch_fs *c = trans->c; - struct write_point *wp; struct open_bucket *ob; - struct open_buckets ptrs; - unsigned nr_effective, write_points_nr; - bool have_cache; + unsigned write_points_nr; int ret; int i; + struct alloc_request req = { + .nr_replicas = nr_replicas, + .target = target, + .ec = erasure_code, + .watermark = watermark, + .flags = flags, + .devs_have = devs_have, + }; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; BUG_ON(!nr_replicas || !nr_replicas_required); retry: - ptrs.nr = 0; - nr_effective = 0; - write_points_nr = c->write_points_nr; - have_cache = false; + req.ptrs.nr = 0; + req.nr_effective = 0; + req.have_cache = false; + write_points_nr = c->write_points_nr; - *wp_ret = wp = writepoint_find(trans, write_point.v); + *wp_ret = req.wp = writepoint_find(trans, write_point.v); ret = bch2_trans_relock(trans); if (ret) goto err; /* 
metadata may not allocate on cache devices: */ - if (wp->data_type != BCH_DATA_user) - have_cache = true; + if (req.wp->data_type != BCH_DATA_user) + req.have_cache = true; if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, NULL); + ret = open_bucket_add_buckets(trans, &req, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + int ret2 = open_bucket_add_buckets(trans, &req, cl); if (!ret2 || bch2_err_matches(ret2, BCH_ERR_transaction_restart) || bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { @@ -1384,45 +1314,39 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, * Only try to allocate cache (durability = 0 devices) from the * specified target: */ - have_cache = true; + req.have_cache = true; + req.target = 0; - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - 0, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, &req, cl); } else { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, &req, cl); } alloc_done: - BUG_ON(!ret && nr_effective < nr_replicas); + BUG_ON(!ret && req.nr_effective < req.nr_replicas); - if (erasure_code && !ec_open_bucket(c, &ptrs)) + if (erasure_code && !ec_open_bucket(c, &req.ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); if (ret == -BCH_ERR_insufficient_devices && - nr_effective >= nr_replicas_required) + req.nr_effective 
>= nr_replicas_required) ret = 0; if (ret) goto err; - if (nr_effective > nr_replicas) - deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + if (req.nr_effective > req.nr_replicas) + deallocate_extra_replicas(c, &req.ptrs, &req.wp->ptrs, + req.nr_effective - req.nr_replicas); /* Free buckets we didn't use: */ - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &req.wp->ptrs, ob, i) open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req.wp->ptrs = req.ptrs; - wp->sectors_free = UINT_MAX; + req.wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) { + open_bucket_for_each(c, &req.wp->ptrs, ob, i) { /* * Ensure proper write alignment - either due to misaligned * bucket sizes (from buggy bcachefs-tools), or writes that mix @@ -1436,29 +1360,29 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + req.wp->sectors_free = min(req.wp->sectors_free, ob->sectors_free); } - wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + req.wp->sectors_free = rounddown(req.wp->sectors_free, block_sectors(c)); /* Did alignment use up space in an open_bucket? 
*/ - if (unlikely(!wp->sectors_free)) { - bch2_alloc_sectors_done(c, wp); + if (unlikely(!req.wp->sectors_free)) { + bch2_alloc_sectors_done(c, req.wp); goto retry; } - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + BUG_ON(!req.wp->sectors_free || req.wp->sectors_free == UINT_MAX); return 0; err: - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ptrs.nr < ARRAY_SIZE(ptrs.v)) - ob_push(c, &ptrs, ob); + open_bucket_for_each(c, &req.wp->ptrs, ob, i) + if (req.ptrs.nr < ARRAY_SIZE(req.ptrs.v)) + ob_push(c, &req.ptrs, ob); else open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req.wp->ptrs = req.ptrs; - mutex_unlock(&wp->lock); + mutex_unlock(&req.wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(trans, write_points_nr)) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4c1e33cf57c0..874aadf34ebf 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -5,6 +5,7 @@ #include "bcachefs.h" #include "alloc_types.h" #include "extents.h" +#include "io_write_types.h" #include "sb-members.h" #include @@ -23,6 +24,22 @@ struct dev_alloc_list { u8 data[BCH_SB_MEMBERS_MAX]; }; +struct alloc_request { + unsigned nr_replicas; + unsigned target; + bool ec; + enum bch_watermark watermark; + enum bch_write_flags flags; + struct bch_devs_list *devs_have; + + struct write_point *wp; + struct open_buckets ptrs; + unsigned nr_effective; + bool have_cache; + + struct bch_devs_mask devs_may_alloc; +}; + struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct dev_stripe_state *, struct bch_devs_mask *); @@ -173,11 +190,9 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 } enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, - struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum bch_write_flags, - enum bch_data_type, enum bch_watermark, - struct closure *); 
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, + struct dev_stripe_state *, enum bch_data_type, + struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c6cb26981923..fc09e0655014 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1714,19 +1714,23 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - struct open_buckets buckets; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - bool have_cache = true; int ret = 0; + struct alloc_request req = { + .watermark = watermark, + .devs_may_alloc = h->devs, + .have_cache = true, + }; + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + bitmap_and(req.devs_may_alloc.d, req.devs_may_alloc.d, + c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* @@ -1736,7 +1740,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, devs.d); + __clear_bit(v->ptrs[i].dev, req.devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; @@ -1747,25 +1751,23 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - buckets.nr = 0; + req.ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, + req.nr_replicas = s->nr_parity; + req.nr_effective = nr_have_parity; + + ret = bch2_bucket_alloc_set_trans(trans, 
&req, &h->parity_stripe, - &devs, - s->nr_parity, - &nr_have_parity, - &have_cache, 0, BCH_DATA_parity, - watermark, cl); - open_bucket_for_each(c, &buckets, ob, i) { + open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req.ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } @@ -1774,24 +1776,22 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, return ret; } - buckets.nr = 0; + req.ptrs.nr = 0; if (nr_have_data < s->nr_data) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, + req.nr_replicas = s->nr_data; + req.nr_effective = nr_have_data; + + ret = bch2_bucket_alloc_set_trans(trans, &req, &h->block_stripe, - &devs, - s->nr_data, - &nr_have_data, - &have_cache, 0, BCH_DATA_user, - watermark, cl); - open_bucket_for_each(c, &buckets, ob, i) { + open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req.ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b8ab19a1e1da..2c0a8f35ee1f 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -17,34 +17,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, __printf(3, 4) void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = 
BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->watermark == BCH_WATERMARK_copygc diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 3ef6df9145ef..b4a6a44a45d0 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -13,6 +13,34 @@ #include #include +#define BCH_WRITE_FLAGS() \ + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + struct bch_write_bio { struct_group(wbio, struct bch_fs *c; From 799c41830332121b067c26b43887a95a4f848c22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 21:06:43 -0400 Subject: [PATCH 010/218] bcachefs: alloc_request.data_type Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 10 +++++----- fs/bcachefs/alloc_foreground.h | 4 ++-- fs/bcachefs/ec.c | 12 ++++-------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 93c91b5706fb..5cca41b28236 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -716,7 +716,6 @@ static int add_new_bucket(struct bch_fs *c, int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct alloc_request *req, struct dev_stripe_state *stripe, - enum bch_data_type data_type, struct closure *cl) { struct bch_fs *c = trans->c; @@ -737,7 +736,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_dev_usage usage; struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, - 
req->watermark, data_type, + req->watermark, req->data_type, cl, req->flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) @@ -933,8 +932,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, - req->wp->data_type, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1285,12 +1283,14 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, *wp_ret = req.wp = writepoint_find(trans, write_point.v); + req.data_type = req.wp->data_type; + ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (req.wp->data_type != BCH_DATA_user) + if (req.data_type != BCH_DATA_user) req.have_cache = true; if (target && !(flags & BCH_WRITE_only_specified_devs)) { diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 874aadf34ebf..24d6e5863737 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -30,6 +30,7 @@ struct alloc_request { bool ec; enum bch_watermark watermark; enum bch_write_flags flags; + enum bch_data_type data_type; struct bch_devs_list *devs_have; struct write_point *wp; @@ -191,8 +192,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 enum bch_write_flags; int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, - struct dev_stripe_state *, enum bch_data_type, - struct closure *); + struct dev_stripe_state *, struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index fc09e0655014..0865fb8a6f36 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1755,11 +1755,9 @@ static int 
new_stripe_alloc_buckets(struct btree_trans *trans, if (nr_have_parity < s->nr_parity) { req.nr_replicas = s->nr_parity; req.nr_effective = nr_have_parity; + req.data_type = BCH_DATA_parity; - ret = bch2_bucket_alloc_set_trans(trans, &req, - &h->parity_stripe, - BCH_DATA_parity, - cl); + ret = bch2_bucket_alloc_set_trans(trans, &req, &h->parity_stripe, cl); open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, @@ -1780,11 +1778,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, if (nr_have_data < s->nr_data) { req.nr_replicas = s->nr_data; req.nr_effective = nr_have_data; + req.data_type = BCH_DATA_user; - ret = bch2_bucket_alloc_set_trans(trans, &req, - &h->block_stripe, - BCH_DATA_user, - cl); + ret = bch2_bucket_alloc_set_trans(trans, &req, &h->block_stripe, cl); open_bucket_for_each(c, &req.ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, From 9259883b79e14aaf127c0ef59b5ccca8e04e77ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 21 Mar 2025 21:13:53 -0400 Subject: [PATCH 011/218] bcachefs: bch2_bucket_alloc_trans() takes alloc_request Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 37 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5cca41b28236..ca8df935f198 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -505,24 +505,23 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * Returns: an open_bucket on success, or an ERR_PTR() on failure. 
*/ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl, - bool nowait, - struct bch_dev_usage *usage) + struct alloc_request *req, + struct bch_dev *ca, + struct closure *cl, + bool nowait, + struct bch_dev_usage *usage) { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; struct bucket_alloc_state s = { - .btree_bitmap = data_type == BCH_DATA_btree, + .btree_bitmap = req->data_type == BCH_DATA_btree, }; bool waiting = nowait; again: bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, watermark); + avail = dev_buckets_free(ca, *usage, req->watermark); if (usage->buckets[BCH_DATA_need_discard] > avail) bch2_dev_do_discards(ca); @@ -534,7 +533,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bch2_dev_do_invalidates(ca); if (!avail) { - if (watermark > BCH_WATERMARK_normal && + if (req->watermark > BCH_WATERMARK_normal && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) goto alloc; @@ -554,8 +553,8 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + ? 
bch2_bucket_alloc_freelist(trans, ca, req->watermark, &s, cl) + : bch2_bucket_alloc_early(trans, ca, req->watermark, &s, cl); if (s.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); @@ -574,7 +573,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - ob->data_type = data_type; + ob->data_type = req->data_type; if (!IS_ERR(ob)) count_event(c, bucket_alloc); @@ -584,7 +583,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, if (!IS_ERR(ob) ? trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); + trace_bucket_alloc2(c, ca, req->watermark, req->data_type, cl, usage, &s, ob); return ob; } @@ -596,10 +595,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, { struct bch_dev_usage usage; struct open_bucket *ob; + struct alloc_request req = { + .watermark = watermark, + .data_type = data_type, + }; bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, false, &usage))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, ca, cl, false, &usage))); return ob; } @@ -735,9 +737,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, } struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, - req->watermark, req->data_type, - cl, req->flags & BCH_WRITE_alloc_nowait, + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, ca, cl, + req->flags & BCH_WRITE_alloc_nowait, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); From 7100344301d80f09ea64c37c35ea10163d35d433 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 15:37:28 -0400 Subject: [PATCH 012/218] bcachefs: bch2_ec_stripe_head_get() takes alloc_request Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c 
| 5 ++--- fs/bcachefs/alloc_foreground.h | 7 +++---- fs/bcachefs/ec.c | 20 ++++++++++---------- fs/bcachefs/ec.h | 5 +++-- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index ca8df935f198..aef27d40d354 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -495,9 +495,8 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object + * @req: state for the entire allocation * @ca: device to allocate from - * @watermark: how important is this allocation? - * @data_type: BCH_DATA_journal, btree, user... * @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available * @usage: for secondarily also returning the current device usage @@ -782,7 +781,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, req->target, 0, req->nr_replicas - 1, req->watermark, cl); + bch2_ec_stripe_head_get(trans, req, 0, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 24d6e5863737..27219cd1368f 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -32,12 +32,11 @@ struct alloc_request { enum bch_write_flags flags; enum bch_data_type data_type; struct bch_devs_list *devs_have; - struct write_point *wp; - struct open_buckets ptrs; - unsigned nr_effective; - bool have_cache; + struct open_buckets ptrs; + unsigned nr_effective; /* sum of @ptrs durability */ + bool have_cache; /* have we allocated from a 0 durability dev */ struct bch_devs_mask devs_may_alloc; }; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0865fb8a6f36..6f977e134d08 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ 
-1977,17 +1977,15 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + struct alloc_request *req, unsigned algo, - unsigned redundancy, - enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - bool waiting = false; + unsigned redundancy = req->nr_replicas - 1; unsigned disk_label = 0; - struct target t = target_decode(target); + struct target t = target_decode(req->target); + bool waiting = false; int ret; if (t.type == TARGET_GROUP) { @@ -1998,7 +1996,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, disk_label = t.group + 1; /* 0 == no label */ } - h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); + struct ec_stripe_head *h = + __bch2_ec_stripe_head_get(trans, disk_label, algo, + redundancy, req->watermark); if (IS_ERR_OR_NULL(h)) return h; @@ -2041,8 +2041,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + if (req->watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2061,7 +2061,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); + ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, cl); if (ret) goto err; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 51893e1ee874..83d37bcb548a 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -255,9 +255,10 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, 
int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); + +struct alloc_request; struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - unsigned, unsigned, unsigned, - enum bch_watermark, struct closure *); + struct alloc_request *, unsigned, struct closure *); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); From ac0952b0e50934a15fdd67a2ff376e6ab8152c39 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 15:46:45 -0400 Subject: [PATCH 013/218] bcachefs: new_stripe_alloc_buckets() takes alloc_request More stack usage improvements: instead of creating a new alloc_request (currently on the stack), save/restore just the fields we need to reuse. This is a bit tricky, because we're doing a normal alloc_foreground.c allocation, which calls into ec.c to get a stripe, which then does more normal allocations - some of the fields get reused, and used differently. So we have to save and restore them - but the stack usage improvements will be well worth it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 73 +++++++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6f977e134d08..11f46dccc14f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1710,8 +1710,9 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, } static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct alloc_request *req, struct ec_stripe_head *h, struct ec_stripe_new *s, - enum bch_watermark watermark, struct closure *cl) + struct closure *cl) { struct bch_fs *c = trans->c; struct open_bucket *ob; @@ -1719,17 +1720,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, unsigned i, j, nr_have_parity = 0, nr_have_data = 0; int ret = 0; - struct alloc_request req = { - .watermark = watermark, - .devs_may_alloc = h->devs, - .have_cache = true, - }; + enum bch_data_type saved_data_type = req->data_type; + struct open_buckets saved_ptrs = req->ptrs; + unsigned saved_nr_replicas = req->nr_replicas; + unsigned saved_nr_effective = req->nr_effective; + bool saved_have_cache = req->have_cache; + struct bch_devs_mask saved_devs_may_alloc = req->devs_may_alloc; + + req->devs_may_alloc = h->devs; + req->have_cache = true; BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(req.devs_may_alloc.d, req.devs_may_alloc.d, + bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { @@ -1740,7 +1745,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, req.devs_may_alloc.d); + __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; @@ -1751,52 +1756,58 @@ static int 
new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - req.ptrs.nr = 0; + req->ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - req.nr_replicas = s->nr_parity; - req.nr_effective = nr_have_parity; - req.data_type = BCH_DATA_parity; + req->nr_replicas = s->nr_parity; + req->nr_effective = nr_have_parity; + req->data_type = BCH_DATA_parity; - ret = bch2_bucket_alloc_set_trans(trans, &req, &h->parity_stripe, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); - open_bucket_for_each(c, &req.ptrs, ob, i) { + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = req.ptrs.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - req.ptrs.nr = 0; + req->ptrs.nr = 0; if (nr_have_data < s->nr_data) { - req.nr_replicas = s->nr_data; - req.nr_effective = nr_have_data; - req.data_type = BCH_DATA_user; + req->nr_replicas = s->nr_data; + req->nr_effective = nr_have_data; + req->data_type = BCH_DATA_user; - ret = bch2_bucket_alloc_set_trans(trans, &req, &h->block_stripe, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); - open_bucket_for_each(c, &req.ptrs, ob, i) { + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = req.ptrs.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - - return 0; +err: + req->data_type = saved_data_type; + req->ptrs = saved_ptrs; + req->nr_replicas = saved_nr_replicas; + req->nr_effective = saved_nr_effective; + req->have_cache = saved_have_cache; + req->devs_may_alloc = saved_devs_may_alloc; + return ret; } static int 
__get_existing_stripe(struct btree_trans *trans, @@ -2022,8 +2033,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; + swap(req->watermark, saved_watermark); + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); + swap(req->watermark, saved_watermark); + if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2042,7 +2057,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto err; if (req->watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, NULL) ?: + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2061,7 +2076,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, req->watermark, cl); + ret = new_stripe_alloc_buckets(trans, req, h, s, cl); if (ret) goto err; From a0312f425177ce7d70da10b726ba2c7b133089f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 17:54:43 -0400 Subject: [PATCH 014/218] bcachefs: alloc_request: deallocate_extra_replicas() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index aef27d40d354..1ed54abc3760 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1219,26 +1219,25 @@ static struct write_point *writepoint_find(struct btree_trans *trans, static noinline void deallocate_extra_replicas(struct bch_fs *c, - struct 
open_buckets *ptrs, - struct open_buckets *ptrs_no_use, - unsigned extra_replicas) + struct alloc_request *req) { struct open_buckets ptrs2 = { 0 }; struct open_bucket *ob; + unsigned extra_replicas = req->nr_effective - req->nr_replicas; unsigned i; - open_bucket_for_each(c, ptrs, ob, i) { + open_bucket_for_each(c, &req->ptrs, ob, i) { unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; - ob_push(c, ptrs_no_use, ob); + ob_push(c, &req->wp->ptrs, ob); } else { ob_push(c, &ptrs2, ob); } } - *ptrs = ptrs2; + req->ptrs = ptrs2; } /* @@ -1335,8 +1334,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, goto err; if (req.nr_effective > req.nr_replicas) - deallocate_extra_replicas(c, &req.ptrs, &req.wp->ptrs, - req.nr_effective - req.nr_replicas); + deallocate_extra_replicas(c, &req); /* Free buckets we didn't use: */ open_bucket_for_each(c, &req.wp->ptrs, ob, i) From 4d00e88d21d6369a2c5ff0fbdec82d418ef10230 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 15:52:39 -0400 Subject: [PATCH 015/218] bcachefs: alloc_request.usage Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 42 ++++++++++++++-------------------- fs/bcachefs/alloc_foreground.h | 4 ++++ 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1ed54abc3760..842443133866 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -454,10 +454,8 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, } static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, + struct alloc_request *req, struct closure *cl, - struct bch_dev_usage *usage, struct bucket_alloc_state *s, struct open_bucket *ob) { @@ -466,11 +464,11 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, 
printbuf_tabstop_push(&buf, 24); prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); - prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); - prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]); - prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); + prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, req->usage, req->watermark)); prt_printf(&buf, "copygc_wait\t%lu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); @@ -499,7 +497,6 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * @ca: device to allocate from * @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available - * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. 
*/ @@ -507,8 +504,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct alloc_request *req, struct bch_dev *ca, struct closure *cl, - bool nowait, - struct bch_dev_usage *usage) + bool nowait) { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; @@ -519,16 +515,16 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, }; bool waiting = nowait; again: - bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, req->watermark); + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark); - if (usage->buckets[BCH_DATA_need_discard] > avail) + if (req->usage.buckets[BCH_DATA_need_discard] > avail) bch2_dev_do_discards(ca); - if (usage->buckets[BCH_DATA_need_gc_gens] > avail) + if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) bch2_gc_gens_async(c); - if (should_invalidate_buckets(ca, *usage)) + if (should_invalidate_buckets(ca, req->usage)) bch2_dev_do_invalidates(ca); if (!avail) { @@ -582,7 +578,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, if (!IS_ERR(ob) ? 
trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, req->watermark, req->data_type, cl, usage, &s, ob); + trace_bucket_alloc2(c, ca, req, cl, &s, ob); return ob; } @@ -592,7 +588,6 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_data_type data_type, struct closure *cl) { - struct bch_dev_usage usage; struct open_bucket *ob; struct alloc_request req = { .watermark = watermark, @@ -600,7 +595,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, }; bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, ca, cl, false, &usage))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, ca, cl, false))); return ob; } @@ -735,12 +730,10 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - struct bch_dev_usage usage; struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, ca, cl, - req->flags & BCH_WRITE_alloc_nowait, - &usage); + req->flags & BCH_WRITE_alloc_nowait); if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + bch2_dev_stripe_increment_inlined(ca, stripe, &req->usage); bch2_dev_put(ca); if (IS_ERR(ob)) { @@ -870,11 +863,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c, if (want_bucket(c, req, ob)) { struct bch_dev *ca = ob_dev(c, ob); - struct bch_dev_usage usage; u64 avail; - bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, req->watermark) + ca->nr_partial_buckets; + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; if (!avail) continue; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 27219cd1368f..ffee1b8894f0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -34,10 +34,14 @@ struct alloc_request { struct bch_devs_list *devs_have; struct write_point *wp; + /* These fields are used primarily by 
open_bucket_add_buckets */ struct open_buckets ptrs; unsigned nr_effective; /* sum of @ptrs durability */ bool have_cache; /* have we allocated from a 0 durability dev */ struct bch_devs_mask devs_may_alloc; + + /* bch2_bucket_alloc_set_trans(): */ + struct bch_dev_usage usage; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, From 7f65d1cf5c30cf3e634cd8a02ea8563f7af53e89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 17:08:43 -0400 Subject: [PATCH 016/218] bcachefs: alloc_request.counters Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 98 ++++++++++++++++------------------ fs/bcachefs/alloc_foreground.h | 19 ++++++- fs/bcachefs/alloc_types.h | 16 ------ 3 files changed, 64 insertions(+), 69 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 842443133866..3712e8722f3d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -180,11 +180,11 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) } static inline bool may_alloc_bucket(struct bch_fs *c, - struct bpos bucket, - struct bucket_alloc_state *s) + struct alloc_request *req, + struct bpos bucket) { if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - s->skipped_open++; + req->counters.skipped_open++; return false; } @@ -193,13 +193,13 @@ static inline bool may_alloc_bucket(struct bch_fs *c, bucket.inode, bucket.offset); if (journal_seq_ready > c->journal.flushed_seq_ondisk) { if (journal_seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - s->skipped_need_journal_commit++; + req->counters.need_journal_commit++; + req->counters.skipped_need_journal_commit++; return false; } if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - s->skipped_nocow++; + req->counters.skipped_nocow++; return false; } @@ -207,22 +207,21 @@ static inline bool may_alloc_bucket(struct bch_fs *c, } static struct open_bucket *__try_alloc_bucket(struct bch_fs 
*c, struct bch_dev *ca, + struct alloc_request *req, u64 bucket, u8 gen, - enum bch_watermark watermark, - struct bucket_alloc_state *s, struct closure *cl) { if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - s->skipped_nouse++; + req->counters.skipped_nouse++; return NULL; } spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -234,7 +233,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * /* Recheck under lock: */ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { spin_unlock(&c->freelist_lock); - s->skipped_open++; + req->counters.skipped_open++; return NULL; } @@ -259,15 +258,14 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, + struct alloc_request *req, struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + if (!may_alloc_bucket(c, req, POS(ca->dev_idx, b))) return NULL; u8 gen; @@ -277,7 +275,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (ret) return NULL; - return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); + return __try_alloc_bucket(c, ca, req, b, gen, cl); } /* @@ -286,8 +284,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, + struct alloc_request *req, 
struct closure *cl) { struct bch_fs *c = trans->c; @@ -295,7 +292,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bkey_s_c k, ck; struct open_bucket *ob = NULL; u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -317,10 +314,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) break; @@ -328,8 +325,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, round_up(bucket_to_sector(ca, bucket) + 1, 1ULL << ca->mi.btree_bitmap_shift)); bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); - s->buckets_seen++; - s->skipped_mi_btree_bitmap++; + req->counters.buckets_seen++; + req->counters.skipped_mi_btree_bitmap++; continue; } @@ -348,11 +345,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, if (a->data_type != BCH_DATA_free) goto next; - s->buckets_seen++; + req->counters.buckets_seen++; - ob = may_alloc_bucket(c, k.k->p, s) - ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, - watermark, s, cl) + ob = may_alloc_bucket(c, req, k.k->p) + ? 
__try_alloc_bucket(c, ca, req, k.k->p.offset, a->gen, cl) : NULL; next: bch2_set_btree_iter_dontneed(trans, &citer); @@ -379,14 +375,13 @@ bch2_bucket_alloc_early(struct btree_trans *trans, static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, + struct alloc_request *req, struct closure *cl) { struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -402,13 +397,13 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, iter.k.size = iter.k.p.offset - iter.pos.offset; while (iter.k.size) { - s->buckets_seen++; + req->counters.buckets_seen++; u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) goto fail; @@ -418,11 +413,11 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); - s->skipped_mi_btree_bitmap++; + req->counters.skipped_mi_btree_bitmap++; goto next; } - ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); + ob = try_alloc_bucket(trans, ca, req, &iter, cl); if (ob) { if (!IS_ERR(ob)) *dev_alloc_cursor = iter.pos.offset; @@ -456,7 +451,6 @@ static struct open_bucket 
*bch2_bucket_alloc_freelist(struct btree_trans *trans, static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, struct alloc_request *req, struct closure *cl, - struct bucket_alloc_state *s, struct open_bucket *ob) { struct printbuf buf = PRINTBUF; @@ -472,12 +466,12 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, prt_printf(&buf, "copygc_wait\t%lu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); - prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); - prt_printf(&buf, "open\t%llu\n", s->skipped_open); - prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); - prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); - prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); + prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); if (!IS_ERR(ob)) { prt_printf(&buf, "allocated\t%llu\n", ob->bucket); @@ -510,10 +504,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { - .btree_bitmap = req->data_type == BCH_DATA_btree, - }; bool waiting = nowait; + + req->btree_bitmap = req->data_type == BCH_DATA_btree; + memset(&req->counters, 0, sizeof(req->counters)); again: bch2_dev_usage_read_fast(ca, &req->usage); avail = dev_buckets_free(ca, req->usage, req->watermark); @@ -548,14 +542,14 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct 
btree_trans *trans, closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, req->watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, req->watermark, &s, cl); + ? bch2_bucket_alloc_freelist(trans, ca, req, cl) + : bch2_bucket_alloc_early(trans, ca, req, cl); - if (s.need_journal_commit * 2 > avail) + if (req->counters.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { - s.btree_bitmap = BTREE_BITMAP_ANY; + if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { + req->btree_bitmap = BTREE_BITMAP_ANY; goto alloc; } @@ -578,7 +572,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, if (!IS_ERR(ob) ? trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, req, cl, &s, ob); + trace_bucket_alloc2(c, ca, req, cl, ob); return ob; } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index ffee1b8894f0..5d311a41d65f 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -41,7 +41,24 @@ struct alloc_request { struct bch_devs_mask devs_may_alloc; /* bch2_bucket_alloc_set_trans(): */ - struct bch_dev_usage usage; + struct bch_dev_usage usage; + + /* bch2_bucket_alloc_trans(): */ + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + + struct { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 need_journal_commit; + u64 skipped_nocow; + u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; + } counters; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 8f79f46c2a78..e7becdf22cba 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -8,22 +8,6 @@ #include "clock_types.h" #include "fifo.h" -struct bucket_alloc_state { - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - 
BTREE_BITMAP_ANY, - } btree_bitmap; - - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; -}; - #define BCH_WATERMARKS() \ x(stripe) \ x(normal) \ From e038213658f09bc775a67cf8e18f0aec4b0f7679 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 17:13:22 -0400 Subject: [PATCH 017/218] bcachefs: alloc_request.ca Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 54 ++++++++++++++++++---------------- fs/bcachefs/alloc_foreground.h | 2 ++ 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 3712e8722f3d..642d22643558 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -206,11 +206,13 @@ static inline bool may_alloc_bucket(struct bch_fs *c, return true; } -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct alloc_request *req, u64 bucket, u8 gen, struct closure *cl) { + struct bch_dev *ca = req->ca; + if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; @@ -257,7 +259,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return ob; } -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct alloc_request *req, struct btree_iter *freespace_iter, struct closure *cl) @@ -265,7 +267,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - if (!may_alloc_bucket(c, req, POS(ca->dev_idx, b))) + if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) return NULL; u8 gen; @@ -275,7 +277,7 @@ static struct open_bucket *try_alloc_bucket(struct 
btree_trans *trans, struct bc if (ret) return NULL; - return __try_alloc_bucket(c, ca, req, b, gen, cl); + return __try_alloc_bucket(c, req, b, gen, cl); } /* @@ -283,11 +285,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, - struct bch_dev *ca, struct alloc_request *req, struct closure *cl) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; @@ -348,7 +350,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, req->counters.buckets_seen++; ob = may_alloc_bucket(c, req, k.k->p) - ? __try_alloc_bucket(c, ca, req, k.k->p.offset, a->gen, cl) + ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) : NULL; next: bch2_set_btree_iter_dontneed(trans, &citer); @@ -374,10 +376,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - struct bch_dev *ca, - struct alloc_request *req, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { + struct bch_dev *ca = req->ca; struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; @@ -417,7 +419,7 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, goto next; } - ob = try_alloc_bucket(trans, ca, req, &iter, cl); + ob = try_alloc_bucket(trans, req, &iter, cl); if (ob) { if (!IS_ERR(ob)) *dev_alloc_cursor = iter.pos.offset; @@ -448,7 +450,7 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, return ob; } -static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, +static noinline void trace_bucket_alloc2(struct bch_fs *c, struct alloc_request *req, struct closure *cl, struct open_bucket *ob) @@ -457,12 +459,12 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, 
printbuf_tabstop_push(&buf, 24); - prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); + prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); prt_printf(&buf, "blocking\t%u\n", cl != NULL); prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); - prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, req->usage, req->watermark)); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); prt_printf(&buf, "copygc_wait\t%lu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); @@ -488,7 +490,6 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object * @req: state for the entire allocation - * @ca: device to allocate from * @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available * @@ -496,11 +497,11 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct alloc_request *req, - struct bch_dev *ca, struct closure *cl, bool nowait) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; @@ -542,8 +543,8 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, req, cl) - : bch2_bucket_alloc_early(trans, ca, req, cl); + ? 
bch2_bucket_alloc_freelist(trans, req, cl) + : bch2_bucket_alloc_early(trans, req, cl); if (req->counters.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); @@ -572,7 +573,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, if (!IS_ERR(ob) ? trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, req, cl, ob); + trace_bucket_alloc2(c, req, cl, ob); return ob; } @@ -586,10 +587,11 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, struct alloc_request req = { .watermark = watermark, .data_type = data_type, + .ca = ca, }; bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, ca, cl, false))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); return ob; } @@ -715,20 +717,20 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) { - struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); - if (!ca) + req->ca = bch2_dev_tryget_noerror(c, *i); + if (!req->ca) continue; - if (!ca->mi.durability && req->have_cache) { - bch2_dev_put(ca); + if (!req->ca->mi.durability && req->have_cache) { + bch2_dev_put(req->ca); continue; } - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, ca, cl, + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, req->flags & BCH_WRITE_alloc_nowait); if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &req->usage); - bch2_dev_put(ca); + bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); + bch2_dev_put(req->ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 5d311a41d65f..7117e1e5c6d9 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -44,6 +44,8 @@ struct alloc_request { struct bch_dev_usage usage; /* 
bch2_bucket_alloc_trans(): */ + struct bch_dev *ca; + enum { BTREE_BITMAP_NO, BTREE_BITMAP_YES, From 95f2315af7536c220301421eff6291c80ec321e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 17:57:06 -0400 Subject: [PATCH 018/218] bcachefs: alloc_request.ptrs2 Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 14 ++++++++------ fs/bcachefs/alloc_foreground.h | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 642d22643558..f546b4fcd58f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -825,18 +825,19 @@ static bool want_bucket(struct bch_fs *c, static int bucket_alloc_set_writepoint(struct bch_fs *c, struct alloc_request *req) { - struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; int ret = 0; + req->ptrs2.nr = 0; + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { if (!ret && want_bucket(c, req, ob)) ret = add_new_bucket(c, req, ob); else - ob_push(c, &ptrs_skip, ob); + ob_push(c, &req->ptrs2, ob); } - req->wp->ptrs = ptrs_skip; + req->wp->ptrs = req->ptrs2; return ret; } @@ -1209,11 +1210,12 @@ static noinline void deallocate_extra_replicas(struct bch_fs *c, struct alloc_request *req) { - struct open_buckets ptrs2 = { 0 }; struct open_bucket *ob; unsigned extra_replicas = req->nr_effective - req->nr_replicas; unsigned i; + req->ptrs2.nr = 0; + open_bucket_for_each(c, &req->ptrs, ob, i) { unsigned d = ob_dev(c, ob)->mi.durability; @@ -1221,11 +1223,11 @@ deallocate_extra_replicas(struct bch_fs *c, extra_replicas -= d; ob_push(c, &req->wp->ptrs, ob); } else { - ob_push(c, &ptrs2, ob); + ob_push(c, &req->ptrs2, ob); } } - req->ptrs = ptrs2; + req->ptrs = req->ptrs2; } /* diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 7117e1e5c6d9..ae8ca3b7786b 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -36,6 +36,7 @@ struct 
alloc_request { /* These fields are used primarily by open_bucket_add_buckets */ struct open_buckets ptrs; + struct open_buckets ptrs2; unsigned nr_effective; /* sum of @ptrs durability */ bool have_cache; /* have we allocated from a 0 durability dev */ struct bch_devs_mask devs_may_alloc; From a0b0b9bb9e3cb896a5585701fa6b340d2ada6f63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Mar 2025 17:50:52 -0400 Subject: [PATCH 019/218] bcachefs: alloc_request no longer on stack Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 84 +++++++++++++++++----------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index f546b4fcd58f..f68e5f6849b0 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1248,49 +1248,51 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct open_bucket *ob; unsigned write_points_nr; - int ret; int i; - struct alloc_request req = { - .nr_replicas = nr_replicas, - .target = target, - .ec = erasure_code, - .watermark = watermark, - .flags = flags, - .devs_have = devs_have, - }; + struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); + int ret = PTR_ERR_OR_ZERO(req); + if (unlikely(ret)) + return ret; + + req->nr_replicas = nr_replicas; + req->target = target; + req->ec = erasure_code; + req->watermark = watermark; + req->flags = flags; + req->devs_have = devs_have; if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; BUG_ON(!nr_replicas || !nr_replicas_required); retry: - req.ptrs.nr = 0; - req.nr_effective = 0; - req.have_cache = false; + req->ptrs.nr = 0; + req->nr_effective = 0; + req->have_cache = false; write_points_nr = c->write_points_nr; - *wp_ret = req.wp = writepoint_find(trans, write_point.v); + *wp_ret = req->wp = writepoint_find(trans, write_point.v); - req.data_type = req.wp->data_type; + req->data_type = 
req->wp->data_type; ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (req.data_type != BCH_DATA_user) - req.have_cache = true; + if (req->data_type != BCH_DATA_user) + req->have_cache = true; if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, &req, NULL); + ret = open_bucket_add_buckets(trans, req, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, &req, cl); + int ret2 = open_bucket_add_buckets(trans, req, cl); if (!ret2 || bch2_err_matches(ret2, BCH_ERR_transaction_restart) || bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { @@ -1303,38 +1305,38 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, * Only try to allocate cache (durability = 0 devices) from the * specified target: */ - req.have_cache = true; - req.target = 0; + req->have_cache = true; + req->target = 0; - ret = open_bucket_add_buckets(trans, &req, cl); + ret = open_bucket_add_buckets(trans, req, cl); } else { - ret = open_bucket_add_buckets(trans, &req, cl); + ret = open_bucket_add_buckets(trans, req, cl); } alloc_done: - BUG_ON(!ret && req.nr_effective < req.nr_replicas); + BUG_ON(!ret && req->nr_effective < req->nr_replicas); - if (erasure_code && !ec_open_bucket(c, &req.ptrs)) + if (erasure_code && !ec_open_bucket(c, &req->ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); if (ret == -BCH_ERR_insufficient_devices && - req.nr_effective >= nr_replicas_required) + req->nr_effective >= nr_replicas_required) ret = 0; if (ret) goto err; - if (req.nr_effective > req.nr_replicas) - deallocate_extra_replicas(c, &req); + if (req->nr_effective > req->nr_replicas) + deallocate_extra_replicas(c, req); /* Free buckets we didn't use: */ - open_bucket_for_each(c, &req.wp->ptrs, ob, i) + 
open_bucket_for_each(c, &req->wp->ptrs, ob, i) open_bucket_free_unused(c, ob); - req.wp->ptrs = req.ptrs; + req->wp->ptrs = req->ptrs; - req.wp->sectors_free = UINT_MAX; + req->wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &req.wp->ptrs, ob, i) { + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { /* * Ensure proper write alignment - either due to misaligned * bucket sizes (from buggy bcachefs-tools), or writes that mix @@ -1348,29 +1350,29 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - req.wp->sectors_free = min(req.wp->sectors_free, ob->sectors_free); + req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); } - req.wp->sectors_free = rounddown(req.wp->sectors_free, block_sectors(c)); + req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); /* Did alignment use up space in an open_bucket? */ - if (unlikely(!req.wp->sectors_free)) { - bch2_alloc_sectors_done(c, req.wp); + if (unlikely(!req->wp->sectors_free)) { + bch2_alloc_sectors_done(c, req->wp); goto retry; } - BUG_ON(!req.wp->sectors_free || req.wp->sectors_free == UINT_MAX); + BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); return 0; err: - open_bucket_for_each(c, &req.wp->ptrs, ob, i) - if (req.ptrs.nr < ARRAY_SIZE(req.ptrs.v)) - ob_push(c, &req.ptrs, ob); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) + if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) + ob_push(c, &req->ptrs, ob); else open_bucket_free_unused(c, ob); - req.wp->ptrs = req.ptrs; + req->wp->ptrs = req->ptrs; - mutex_unlock(&req.wp->lock); + mutex_unlock(&req->wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(trans, write_points_nr)) From 2a81bd454c45c89b167b6c2bd3ba7b5a489b0830 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 17:23:22 -0400 Subject: [PATCH 020/218] bcachefs: reduce new_stripe_alloc_buckets() stack usage Signed-off-by: Kent 
Overstreet --- fs/bcachefs/alloc_foreground.c | 12 ++++++------ fs/bcachefs/alloc_foreground.h | 8 +++++++- fs/bcachefs/ec.c | 24 ++++++++++++------------ 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index f68e5f6849b0..31d2207a071b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -829,15 +829,15 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c, unsigned i; int ret = 0; - req->ptrs2.nr = 0; + req->scratch_ptrs.nr = 0; open_bucket_for_each(c, &req->wp->ptrs, ob, i) { if (!ret && want_bucket(c, req, ob)) ret = add_new_bucket(c, req, ob); else - ob_push(c, &req->ptrs2, ob); + ob_push(c, &req->scratch_ptrs, ob); } - req->wp->ptrs = req->ptrs2; + req->wp->ptrs = req->scratch_ptrs; return ret; } @@ -1214,7 +1214,7 @@ deallocate_extra_replicas(struct bch_fs *c, unsigned extra_replicas = req->nr_effective - req->nr_replicas; unsigned i; - req->ptrs2.nr = 0; + req->scratch_ptrs.nr = 0; open_bucket_for_each(c, &req->ptrs, ob, i) { unsigned d = ob_dev(c, ob)->mi.durability; @@ -1223,11 +1223,11 @@ deallocate_extra_replicas(struct bch_fs *c, extra_replicas -= d; ob_push(c, &req->wp->ptrs, ob); } else { - ob_push(c, &req->ptrs2, ob); + ob_push(c, &req->scratch_ptrs, ob); } } - req->ptrs = req->ptrs2; + req->ptrs = req->scratch_ptrs; } /* diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index ae8ca3b7786b..192203410d4e 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -36,7 +36,6 @@ struct alloc_request { /* These fields are used primarily by open_bucket_add_buckets */ struct open_buckets ptrs; - struct open_buckets ptrs2; unsigned nr_effective; /* sum of @ptrs durability */ bool have_cache; /* have we allocated from a 0 durability dev */ struct bch_devs_mask devs_may_alloc; @@ -62,6 +61,13 @@ struct alloc_request { u64 skipped_nouse; u64 skipped_mi_btree_bitmap; } counters; + + unsigned 
scratch_nr_replicas; + unsigned scratch_nr_effective; + bool scratch_have_cache; + enum bch_data_type scratch_data_type; + struct open_buckets scratch_ptrs; + struct bch_devs_mask scratch_devs_may_alloc; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 11f46dccc14f..37e63137041c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1720,12 +1720,12 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, unsigned i, j, nr_have_parity = 0, nr_have_data = 0; int ret = 0; - enum bch_data_type saved_data_type = req->data_type; - struct open_buckets saved_ptrs = req->ptrs; - unsigned saved_nr_replicas = req->nr_replicas; - unsigned saved_nr_effective = req->nr_effective; - bool saved_have_cache = req->have_cache; - struct bch_devs_mask saved_devs_may_alloc = req->devs_may_alloc; + req->scratch_data_type = req->data_type; + req->scratch_ptrs = req->ptrs; + req->scratch_nr_replicas = req->nr_replicas; + req->scratch_nr_effective = req->nr_effective; + req->scratch_have_cache = req->have_cache; + req->scratch_devs_may_alloc = req->devs_may_alloc; req->devs_may_alloc = h->devs; req->have_cache = true; @@ -1801,12 +1801,12 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, goto err; } err: - req->data_type = saved_data_type; - req->ptrs = saved_ptrs; - req->nr_replicas = saved_nr_replicas; - req->nr_effective = saved_nr_effective; - req->have_cache = saved_have_cache; - req->devs_may_alloc = saved_devs_may_alloc; + req->data_type = req->scratch_data_type; + req->ptrs = req->scratch_ptrs; + req->nr_replicas = req->scratch_nr_replicas; + req->nr_effective = req->scratch_nr_effective; + req->have_cache = req->scratch_have_cache; + req->devs_may_alloc = req->scratch_devs_may_alloc; return ret; } From ea27e8ca5d8e117b17a3d76c39404d206c5ebdeb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 11:59:39 -0400 Subject: [PATCH 021/218] bcachefs: darray: provide typedefs for 
primitive types Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.h | 12 +++++++++++- fs/bcachefs/fsck.c | 2 -- fs/bcachefs/journal_types.h | 2 -- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index c6151495985f..88f0ca3f0af5 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -20,7 +20,17 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; +typedef DARRAY(char *) darray_str; + +typedef DARRAY(u8) darray_u8; +typedef DARRAY(u16) darray_u16; +typedef DARRAY(u32) darray_u32; +typedef DARRAY(u64) darray_u64; + +typedef DARRAY(s8) darray_s8; +typedef DARRAY(s16) darray_s16; +typedef DARRAY(s32) darray_s32; +typedef DARRAY(s64) darray_s64; int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index aaf187085276..d927fdafd43a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2445,8 +2445,6 @@ int bch2_check_root(struct bch_fs *c) return ret; } -typedef DARRAY(u32) darray_u32; - static bool darray_u32_has(darray_u32 *d, u32 v) { darray_for_each(*d, i) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8e0eba776b9d..51104bbb99da 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -151,8 +151,6 @@ enum journal_flags { #undef x }; -typedef DARRAY(u64) darray_u64; - struct journal_bio { struct bch_dev *ca; unsigned buf_idx; From b974357c63d0b26606210942dc5659d755089d4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 14:40:06 -0400 Subject: [PATCH 022/218] bcachefs: bch2_snapshot_table_make_room() Add a better helper for check_snapshot_exists(). create_snapids() can't be changed to use this, unfortunately, because the transaction that creates new snapshot will also be inserting other keys (e.g. 
root inode) that reference that snapshot ID, and they expect the snapshot table to already be updated. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index fec569c7deb1..2eede851572c 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -281,6 +281,16 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, return ret; } +static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + int ret = snapshot_t_mut(c, id) + ? 0 + : -BCH_ERR_ENOMEM_mark_snapshot; + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -887,9 +897,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) } bch2_trans_iter_exit(trans, &iter); - return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); + return bch2_snapshot_table_make_room(c, id) ?: + bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); } /* Figure out which snapshot nodes belong in the same tree: */ From bcaea61adc1c19094cafbf0269fe99227b2ac89c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 12:18:39 -0400 Subject: [PATCH 023/218] bcachefs: add missing include Hygiene, and fix build in userspace. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 751a9679d7e5..e5b3e987d7bb 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -25,6 +25,7 @@ #include "subvolume.h" #include "trace.h" +#include #include #include From c9b5d9cd26bde01a0591cd8eeed8847da997f576 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 22:30:39 -0400 Subject: [PATCH 024/218] bcachefs: bch2_kvmalloc() mem alloc profiling Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 3e52c7f8ddd2..ccc1cf699c4b 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -55,15 +55,16 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void *bch2_kvmalloc(size_t n, gfp_t flags) +static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) { void *p = unlikely(n >= INT_MAX) - ? vmalloc(n) - : kvmalloc(n, flags & ~__GFP_ZERO); + ? vmalloc_noprof(n) + : kvmalloc_noprof(n, flags & ~__GFP_ZERO); if (p && (flags & __GFP_ZERO)) memset(p, 0, n); return p; } +#define bch2_kvmalloc(...) 
alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) #define init_heap(heap, _size, gfp) \ ({ \ From 2767f4f258b8d034a99830fafdb46a8c52910bce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Apr 2025 19:23:52 -0400 Subject: [PATCH 025/218] bcachefs: btree_io_complete_wq -> btree_write_complete_wq Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/super.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 24eed2b3be4d..09df91f10c20 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -877,7 +877,7 @@ struct bch_fs { struct btree_write_buffer btree_write_buffer; struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_io_complete_wq; + struct workqueue_struct *btree_write_complete_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; /* diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 60782f3e5aec..69b207502381 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2184,7 +2184,7 @@ static void btree_node_write_endio(struct bio *bio) smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->btree_io_complete_wq, &wb->work); + queue_work(c->btree_write_complete_wq, &wb->work); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 84a37d971ffd..cb3195a4fdb0 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -601,8 +601,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->btree_io_complete_wq) - destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_write_complete_wq) + destroy_workqueue(c->btree_write_complete_wq); if (c->btree_update_wq) 
destroy_workqueue(c->btree_update_wq); @@ -876,7 +876,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", + !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || From 25ee021c7fc22797ac34b9b9fb9b24921b647901 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Apr 2025 19:26:19 -0400 Subject: [PATCH 026/218] bcachefs: simplify journal pin initialization Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bb45d3634194..e10f9b930aa6 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1429,13 +1429,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) */ nr += nr / 4; - if (nr + 1 > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -BCH_ERR_ENOMEM_journal_pin_fifo; - } + nr = max(nr, JOURNAL_PIN); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -BCH_ERR_ENOMEM_journal_pin_fifo; } j->replay_journal_seq = last_seq; @@ -1610,9 +1608,6 @@ int bch2_fs_journal_init(struct journal *j) ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) - return -BCH_ERR_ENOMEM_journal_pin_fifo; - j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; j->free_buf = 
kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) @@ -1621,8 +1616,6 @@ int bch2_fs_journal_init(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - j->pin.front = j->pin.back = 1; - j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); if (!j->wq) From 31813dcf379d7fc513530e3a9cf7b60cd2aa2a9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Apr 2025 19:41:35 -0400 Subject: [PATCH 027/218] bcachefs: alphabetize init function calls Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cb3195a4fdb0..93ba6fef40b8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -780,17 +780,18 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_copygc_init(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); - bch2_fs_rebalance_init(c); - bch2_fs_quota_init(c); + bch2_fs_btree_cache_init_early(&c->btree_cache); + bch2_fs_btree_interior_update_init_early(c); + bch2_fs_btree_iter_init_early(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_copygc_init(c); bch2_fs_ec_init_early(c); + bch2_fs_journal_keys_init(c); bch2_fs_move_init(c); + bch2_fs_quota_init(c); + bch2_fs_rebalance_init(c); bch2_fs_sb_errors_init_early(c); INIT_LIST_HEAD(&c->list); @@ -817,8 +818,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - 
bch2_fs_btree_cache_init_early(&c->btree_cache); - mutex_init(&c->sectors_available_lock); ret = percpu_init_rwsem(&c->mark_lock); @@ -905,29 +904,30 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } - ret = bch2_fs_counters_init(c) ?: - bch2_fs_sb_errors_init(c) ?: + ret = + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_gc_init(c) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_journal_init(&c->journal) ?: - bch2_fs_btree_iter_init(c) ?: - bch2_fs_btree_cache_init(c) ?: - bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_gc_init(c) ?: - bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_nocow_locking_init(c) ?: - bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: + bch2_fs_counters_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_vfs_init(c) ?: + bch2_fs_encryption_init(c) ?: bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_fs_io_direct_init(c); + bch2_fs_fs_io_direct_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_io_write_init(c) ?: + bch2_fs_journal_init(&c->journal) ?: + bch2_fs_nocow_locking_init(c) ?: + bch2_fs_sb_errors_init(c) ?: + bch2_fs_subvolumes_init(c) ?: + bch2_fs_vfs_init(c); if (ret) goto err; From a17e985be9831bf866795fe5e3da219d2061ce6c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Apr 2025 19:30:43 -0400 Subject: [PATCH 028/218] bcachefs: Move various init code to _init_early() _init_early() is for initialization that cannot fail, and often must happen for teardown partway through initialization to work. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 7 +------ fs/bcachefs/btree_gc.h | 3 +-- fs/bcachefs/btree_write_buffer.c | 7 ++++++- fs/bcachefs/btree_write_buffer.h | 1 + fs/bcachefs/journal.c | 5 ++++- fs/bcachefs/journal.h | 1 + fs/bcachefs/nocow_locking.c | 4 +--- fs/bcachefs/nocow_locking.h | 2 +- fs/bcachefs/subvolume.c | 3 +-- fs/bcachefs/subvolume.h | 2 +- fs/bcachefs/super.c | 9 +++++---- 11 files changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 37b69d89341f..1f02d28c175c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1266,16 +1266,11 @@ void bch2_gc_gens_async(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_fs_btree_gc_exit(struct bch_fs *c) -{ -} - -int bch2_fs_btree_gc_init(struct bch_fs *c) +void bch2_fs_btree_gc_init_early(struct bch_fs *c) { seqcount_init(&c->gc_pos_lock); INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - return 0; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 9693a90a48a2..ec77662369a2 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); -void bch2_fs_btree_gc_exit(struct bch_fs *); -int bch2_fs_btree_gc_init(struct bch_fs *); +void bch2_fs_btree_gc_init_early(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 0941fb2c026d..68ab48af40f0 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -866,13 +866,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) darray_exit(&wb->inc.keys); } -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) { struct btree_write_buffer *wb = 
&c->btree_write_buffer; mutex_init(&wb->inc.lock); mutex_init(&wb->flushing.lock); INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; /* Will be resized by journal as needed: */ unsigned initial_size = 1 << 16; diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index d535cea28bde..05f56fd1eed0 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -101,6 +101,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index e10f9b930aa6..7522a618b9c9 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1588,7 +1588,7 @@ void bch2_fs_journal_exit(struct journal *j) free_fifo(&j->pin); } -int bch2_fs_journal_init(struct journal *j) +void bch2_fs_journal_init_early(struct journal *j) { static struct lock_class_key res_key; @@ -1607,7 +1607,10 @@ int bch2_fs_journal_init(struct journal *j) atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +} +int bch2_fs_journal_init(struct journal *j) +{ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 641e20c05a14..886ffd9c0db6 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -458,6 +458,7 @@ void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct 
bch_sb *); void bch2_fs_journal_exit(struct journal *); +void bch2_fs_journal_init_early(struct journal *); int bch2_fs_journal_init(struct journal *); #endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 3c21981a4a1c..962218fa68ec 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c) BUG_ON(atomic_read(&l->l[j])); } -int bch2_fs_nocow_locking_init(struct bch_fs *c) +void bch2_fs_nocow_locking_init_early(struct bch_fs *c) { struct bucket_nocow_lock_table *t = &c->nocow_locks; for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) spin_lock_init(&l->lock); - - return 0; } diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h index f9d6a426a960..48b8a003c0d2 100644 --- a/fs/bcachefs/nocow_locking.h +++ b/fs/bcachefs/nocow_locking.h @@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); void bch2_fs_nocow_locking_exit(struct bch_fs *); -int bch2_fs_nocow_locking_init(struct bch_fs *); +void bch2_fs_nocow_locking_init_early(struct bch_fs *); #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 239ea783698c..0421ffc1128f 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -714,11 +714,10 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } -int bch2_fs_subvolumes_init(struct bch_fs *c) +void bch2_fs_subvolumes_init_early(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, bch2_subvolume_wait_for_pagecache_and_delete); mutex_init(&c->snapshots_unlinked_lock); - return 0; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f640c1e3d639..ee5e4e5a0fc8 100644 --- 
a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -86,6 +86,6 @@ int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, boo int bch2_initialize_subvolumes(struct bch_fs *); int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); -int bch2_fs_subvolumes_init(struct bch_fs *); +void bch2_fs_subvolumes_init_early(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 93ba6fef40b8..9cff32bde7a4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -567,7 +567,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - bch2_fs_btree_gc_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); @@ -783,16 +782,21 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_btree_cache_init_early(&c->btree_cache); + bch2_fs_btree_gc_init_early(c); bch2_fs_btree_interior_update_init_early(c); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_write_buffer_init_early(c); bch2_fs_copygc_init(c); bch2_fs_ec_init_early(c); + bch2_fs_journal_init_early(&c->journal); bch2_fs_journal_keys_init(c); bch2_fs_move_init(c); + bch2_fs_nocow_locking_init_early(c); bch2_fs_quota_init(c); bch2_fs_rebalance_init(c); bch2_fs_sb_errors_init_early(c); + bch2_fs_subvolumes_init_early(c); INIT_LIST_HEAD(&c->list); @@ -906,7 +910,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ret = bch2_fs_btree_cache_init(c) ?: - bch2_fs_btree_gc_init(c) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: @@ -924,9 +927,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, 
struct bch_opts opts) bch2_fs_io_read_init(c) ?: bch2_fs_io_write_init(c) ?: bch2_fs_journal_init(&c->journal) ?: - bch2_fs_nocow_locking_init(c) ?: bch2_fs_sb_errors_init(c) ?: - bch2_fs_subvolumes_init(c) ?: bch2_fs_vfs_init(c); if (ret) goto err; From d4d71b58e5139afc5f9bda0139b99404eb216d8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Apr 2025 17:36:04 -0400 Subject: [PATCH 029/218] bcachefs: RO mounts now use less memory Defer memory allocations only needed in RW mode until we actually go RW. This is part of improved support for RO images. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/io_read.c | 8 +++++++ fs/bcachefs/io_write.c | 8 ------- fs/bcachefs/super.c | 51 +++++++++++++++++++++++++++++------------- 4 files changed, 44 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 09df91f10c20..1e40ad2a7bce 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -614,6 +614,7 @@ struct bch_dev { x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ + x(rw_init_done) \ x(was_rw) \ x(stopping) \ x(emergency_ro) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e5b3e987d7bb..e490f136d63d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1490,10 +1490,18 @@ void bch2_fs_io_read_exit(struct bch_fs *c) rhashtable_destroy(&c->promote_table); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); + mempool_exit(&c->bio_bounce_pages); } int bch2_fs_io_read_init(struct bch_fs *c) { + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_bio_read_init; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index c1237da079ed..401347e135b7 100644 --- a/fs/bcachefs/io_write.c +++ 
b/fs/bcachefs/io_write.c @@ -1744,7 +1744,6 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) void bch2_fs_io_write_exit(struct bch_fs *c) { - mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } @@ -1755,12 +1754,5 @@ int bch2_fs_io_write_init(struct bch_fs *c) bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) return -BCH_ERR_ENOMEM_bio_write_init; - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; - return 0; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9cff32bde7a4..834ba091e84f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -183,6 +183,7 @@ static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); static void bch2_dev_io_ref_stop(struct bch_dev *, int); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); +static int bch2_fs_init_rw(struct bch_fs *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { @@ -439,6 +440,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch_info(c, "going read-write"); + ret = bch2_fs_init_rw(c); + if (ret) + goto err; + ret = bch2_sb_members_v2_init(c); if (ret) goto err; @@ -736,6 +741,35 @@ static int bch2_fs_online(struct bch_fs *c) return ret; } +static int bch2_fs_init_rw(struct bch_fs *c) +{ + if (test_bit(BCH_FS_rw_init_done, &c->flags)) + return 0; + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || + !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->btree_write_submit_wq = 
alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0))) + return -BCH_ERR_ENOMEM_fs_other_alloc; + + int ret = bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_fs_io_buffered_init(c) ?: + bch2_fs_io_write_init(c) ?: + bch2_fs_journal_init(&c->journal); + if (ret) + return ret; + + set_bit(BCH_FS_rw_init_done, &c->flags); + return 0; +} + static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_fs *c; @@ -877,18 +911,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", + if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || @@ -911,9 +935,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ret = bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_iter_init(c) ?: - bch2_fs_btree_interior_update_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: 
bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: @@ -922,11 +944,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_ec_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c) ?: bch2_fs_io_read_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_journal_init(&c->journal) ?: bch2_fs_sb_errors_init(c) ?: bch2_fs_vfs_init(c); if (ret) From 3a2a0d08b225047ac1d2504059c45a5acf8072b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 20:56:09 -0400 Subject: [PATCH 030/218] bcachefs: move_data_phys: stats are not required Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fe2fa665150b..a4678a205da6 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -916,8 +916,10 @@ static int bch2_move_data_phys(struct bch_fs *c, bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ctxt.stats->phys = true; - ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + if (ctxt.stats) { + ctxt.stats->phys = true; + ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + } int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); bch2_moving_ctxt_exit(&ctxt); From 0e790469bf3044022cb02025abdae775c9908ca8 Mon Sep 17 00:00:00 2001 From: Integral Date: Sun, 6 Apr 2025 22:53:28 +0800 Subject: [PATCH 031/218] bcachefs: early return for negative values when parsing BCH_OPT_UINT Currently, when passing a negative integer as argument, the error message is "too big" due to casting to an unsigned integer: > bcachefs format --block_size=-1 bcachefs.img invalid option: block_size: too big (max 65536) When a negative value is detected in the argument, return early before calling bch2_opt_validate(). 
A new error code `BCH_ERR_option_negative` is added. Signed-off-by: Integral Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/opts.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index d9ebffa5b3a2..768b176f6ea8 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -211,6 +211,7 @@ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ x(EINVAL, erasure_coding_found_btree_node) \ + x(EINVAL, option_negative) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index af3258814822..f40de111e527 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -360,9 +360,15 @@ int bch2_opt_parse(struct bch_fs *c, return -EINVAL; } - ret = opt->flags & OPT_HUMAN_READABLE - ? bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); + if (*val != '-') { + ret = opt->flags & OPT_HUMAN_READABLE + ? bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); + } else { + prt_printf(err, "%s: must be a non-negative number", opt->attr.name); + return -BCH_ERR_option_negative; + } + if (ret < 0) { if (err) prt_printf(err, "%s: must be a number", From 84ccd47d265579dd23768e69b5204801ad6b5eca Mon Sep 17 00:00:00 2001 From: Integral Date: Sun, 6 Apr 2025 23:26:59 +0800 Subject: [PATCH 032/218] bcachefs: split error messages of invalid compression into two lines When an invalid compression type or level is passed as an argument to `--compression`, two error messages are squashed into one line: > bcachefs format --compression=lzo bcachefs-comp.img invalid option: invalid compression typecompression: parse error > bcachefs format --compression=lz4:16 bcachefs-comp.img invalid option: invalid compression levelcompression: parse error To resolve this issue, add a newline character at the end of the first error message to separate them into two lines. 
Signed-off-by: Integral Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 28ed32449913..d68c3c7896a3 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -714,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, ret = match_string(bch2_compression_opts, -1, type_str); if (ret < 0 && err) - prt_str(err, "invalid compression type"); + prt_str(err, "invalid compression type\n"); if (ret < 0) goto err; @@ -729,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, if (!ret && level > 15) ret = -EINVAL; if (ret < 0 && err) - prt_str(err, "invalid compression level"); + prt_str(err, "invalid compression level\n"); if (ret < 0) goto err; From dd1b99f7060f781bea9973036ee96daa856f08c0 Mon Sep 17 00:00:00 2001 From: Integral Date: Tue, 8 Apr 2025 18:31:29 +0800 Subject: [PATCH 033/218] bcachefs: indent error messages of invalid compression This patch uses printbuf_indent_add_nextline() to set a consistent indentation level for error messages of invalid compression. In my previous patch [1], the newline is added by using '\n' in the argument of prt_str(). This patch replaces prt_str() with prt_printf() to make indentation level work correctly. 
[1] Link: https://lore.kernel.org/20250406152659.205997-2-integral@archlinuxcn.org Signed-off-by: Integral Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 4 ++-- fs/bcachefs/opts.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index d68c3c7896a3..1bca61d17092 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -714,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, ret = match_string(bch2_compression_opts, -1, type_str); if (ret < 0 && err) - prt_str(err, "invalid compression type\n"); + prt_printf(err, "invalid compression type\n"); if (ret < 0) goto err; @@ -729,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, if (!ret && level > 15) ret = -EINVAL; if (ret < 0 && err) - prt_str(err, "invalid compression level\n"); + prt_printf(err, "invalid compression level\n"); if (ret < 0) goto err; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index f40de111e527..ed2e2850c1d1 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -337,6 +337,9 @@ int bch2_opt_parse(struct bch_fs *c, { ssize_t ret; + if (err) + printbuf_indent_add_nextline(err, 2); + switch (opt->type) { case BCH_OPT_BOOL: if (val) { From 2758c28acabc16317f690874ac930aae50a4e461 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 05:23:40 -0400 Subject: [PATCH 034/218] bcachefs: export bch2_chacha20 Needed for userspace. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 4 ++-- fs/bcachefs/checksum.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index d0a34a097b80..ff5ab8ada777 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -106,8 +106,8 @@ static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS], memzero_explicit(key_words, sizeof(key_words)); } -static void bch2_chacha20(const struct bch_key *key, struct nonce nonce, - void *data, size_t len) +void bch2_chacha20(const struct bch_key *key, struct nonce nonce, + void *data, size_t len) { u32 state[CHACHA_STATE_WORDS]; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1310782d3ae9..7bd9cf6104ca 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -69,6 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out, bch2_csum_to_text(out, type, expected); } +void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); + int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *); From ef8dd631f788810e19138771a7e72956467bef0f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 6 Apr 2025 13:50:20 -0400 Subject: [PATCH 035/218] bcachefs: Improve opts.degraded Kill 'opts.very_degraded', and make 'opts.degraded' a persistent option, stored in the superblock. It's now an enum, with available choices ask/yes/very/no. "ask" mode will be handled by the mount helper, for prompting the user (on a machine used interactively) for whether to do a degraded mount. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 ++++++++ fs/bcachefs/opts.c | 67 ++++++++++++++++++++++++++--------- fs/bcachefs/opts.h | 10 ++---- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/super.c | 18 +++++----- 5 files changed, 77 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d6e4a496f02b..6cbc267445b7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -867,6 +867,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); +LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) @@ -989,6 +990,19 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_DEGRADED_ACTIONS() \ + x(ask, 0) \ + x(yes, 1) \ + x(very, 2) \ + x(no, 3) + +enum bch_degraded_actions { +#define x(t, n) BCH_DEGRADED_##t = n, + BCH_DEGRADED_ACTIONS() +#undef x + BCH_DEGRADED_ACTIONS_NR +}; + #define BCH_STR_HASH_TYPES() \ x(crc32c, 0) \ x(crc64, 1) \ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index ed2e2850c1d1..b3fcffc91d6f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -19,6 +19,11 @@ const char * const bch2_error_actions[] = { NULL }; +const char * const bch2_degraded_actions[] = { + BCH_DEGRADED_ACTIONS() + NULL +}; + const char * const bch2_fsck_fix_opts[] = { BCH_FIX_ERRORS_OPTS() NULL @@ -273,20 +278,20 @@ int bch2_opt_lookup(const char *name) return -1; } -struct synonym { +struct opt_synonym { const char *s1, *s2; }; -static const struct synonym bch_opt_synonyms[] = { +static const struct opt_synonym bch2_opt_synonyms[] = { { "quota", "usrquota" }, }; static int bch2_mount_opt_lookup(const 
char *name) { - const struct synonym *i; + const struct opt_synonym *i; - for (i = bch_opt_synonyms; - i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + for (i = bch2_opt_synonyms; + i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); i++) if (!strcmp(name, i->s1)) name = i->s2; @@ -294,6 +299,30 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } +struct opt_val_synonym { + const char *opt, *v1, *v2; +}; + +static const struct opt_val_synonym bch2_opt_val_synonyms[] = { + { "degraded", "true", "yes" }, + { "degraded", "false", "no" }, + { "degraded", "1", "yes" }, + { "degraded", "0", "no" }, +}; + +static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) +{ + const struct opt_val_synonym *i; + + for (i = bch2_opt_val_synonyms; + i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); + i++) + if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) + return i->v2; + + return val; +} + int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { @@ -342,19 +371,17 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: - if (val) { - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } - } else { - *res = 1; - } + if (!val) + val = "1"; + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; + } else { + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } break; case BCH_OPT_UINT: if (!val) { @@ -545,6 +572,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, if (id < 0) return 0; + /* must have a value for synonym lookup - but OPT_FN is weird */ + if (!val && bch2_opt_table[id].type != BCH_OPT_FN) + val = "1"; + + val = bch2_opt_val_synonym_lookup(name, 
val); + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index dfb14810124c..cbb13e91789d 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -11,6 +11,7 @@ struct bch_fs; extern const char * const bch2_error_actions[]; +extern const char * const bch2_degraded_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; @@ -307,14 +308,9 @@ enum fsck_err_opts { NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_STR(bch2_degraded_actions), \ + BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ NULL, "Allow mounting in degraded mode") \ - x(very_degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allow mounting in when data will be missing") \ x(no_splitbrain_check, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index cb5d960aed92..adfcd8a92b93 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1022,7 +1022,7 @@ int bch2_write_super(struct bch_fs *c) trace_and_count(c, write_super, c, _RET_IP_); - if (c->opts.very_degraded) + if (c->opts.degraded == BCH_DEGRADED_very) degraded_flags |= BCH_FORCE_IF_LOST; lockdep_assert_held(&c->sb_lock); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 834ba091e84f..027e10766185 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1040,19 +1040,18 @@ static void print_mount_opts(struct bch_fs *c) static bool bch2_fs_may_start(struct bch_fs *c) { struct bch_dev *ca; - unsigned i, flags = 0; + unsigned flags = 0; - if (c->opts.very_degraded) + switch (c->opts.degraded) { + case BCH_DEGRADED_very: flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - - if (c->opts.degraded) + break; + case BCH_DEGRADED_yes: flags |= BCH_FORCE_IF_DEGRADED; - - if (!c->opts.degraded && - 
!c->opts.very_degraded) { + break; + default: mutex_lock(&c->sb_lock); - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; @@ -1066,6 +1065,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) } } mutex_unlock(&c->sb_lock); + break; } return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); From 68aaeb7c8bc8fd82805b5045c606bc5fe00cbea8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 07:42:46 -0400 Subject: [PATCH 036/218] bcachefs: kill BTREE_CACHE_NOT_FREED_INCREMENT() Small cleanup, just always increment the counters. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 46 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 899891295797..560c29536293 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -17,12 +17,6 @@ #include #include -#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ -do { \ - if (shrinker_counter) \ - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ -} while (0) - const char * const bch2_btree_node_flags[] = { "typebit", "typebit", @@ -354,7 +348,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -366,11 +360,11 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b (1U << BTREE_NODE_write_in_flight))) { if (!flush) { if (btree_node_dirty(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; else if 
(btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; return -BCH_ERR_ENOMEM_btree_node_reclaim; } @@ -380,12 +374,12 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b } if (!six_trylock_intent(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; return -BCH_ERR_ENOMEM_btree_node_reclaim; } if (!six_trylock_write(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; goto out_unlock_intent; } @@ -394,9 +388,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b (1U << BTREE_NODE_write_in_flight))) { if (!flush) { if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; goto out_unlock; } six_unlock_write(&b->c.lock); @@ -405,21 +399,21 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b } if (btree_node_noevict(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; goto out_unlock; } if (btree_node_write_blocked(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; goto out_unlock; } if (btree_node_will_make_reachable(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; goto out_unlock; } if (btree_node_dirty(b)) { if (!flush) { - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + 
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; goto out_unlock; } /* @@ -451,14 +445,14 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b goto out; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, false, shrinker_counter); + return __btree_node_reclaim(c, b, false); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true, false); + return __btree_node_reclaim(c, b, true); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -506,7 +500,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b, true)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -522,7 +516,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, clear_btree_node_accessed(b); bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; --touched;; - } else if (!btree_node_reclaim(c, b, true)) { + } else if (!btree_node_reclaim(c, b)) { __bch2_btree_node_hash_remove(bc, b); __btree_node_data_free(bc, b); @@ -755,7 +749,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_reclaim(c, b, false)) + if (!btree_node_reclaim(c, b)) return b; while (1) { @@ -790,7 +784,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. 
Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b, false)) { + if (!btree_node_reclaim(c, b)) { list_del_init(&b->list); goto got_node; } @@ -817,7 +811,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * the list. Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2, false)) { + if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); From e50fe14c5430d30c5464ebdaef7d0e9684480aca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 07:45:13 -0400 Subject: [PATCH 037/218] bcachefs: __btree_node_reclaim_checks() Factor out a helper so we're not duplicating checks after locking the btree node. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 147 +++++++++++++++++++------------------- 1 file changed, 75 insertions(+), 72 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 560c29536293..9d6f78e9600e 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -344,6 +344,69 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } +static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, + bool flush, bool locked) +{ + struct btree_cache *bc = &c->btree_cache; + + lockdep_assert_held(&bc->lock); + + if (btree_node_noevict(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + if (btree_node_write_blocked(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + if (btree_node_will_make_reachable(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + + if (btree_node_dirty(b)) { + if (!flush) { + 
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + + if (locked) { + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); + else + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); + } + } + + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) { + if (btree_node_read_in_flight(b)) + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; + else if (btree_node_write_in_flight(b)) + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + + if (locked) + return -EINTR; + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } + + return 0; +} + /* * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) @@ -354,24 +417,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); -wait_on_io: - if (b->flags & ((1U << BTREE_NODE_dirty)| - (1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_dirty(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; - else if (btree_node_read_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; - else if (btree_node_write_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; - } - - /* XXX: waiting on IO with btree cache lock held */ - bch2_btree_node_wait_on_read(b); - bch2_btree_node_wait_on_write(b); - } +retry_unlocked: + ret = 
__btree_node_reclaim_checks(c, b, flush, false); + if (ret) + return ret; if (!six_trylock_intent(&b->c.lock)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; @@ -380,69 +429,23 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (!six_trylock_write(&b->c.lock)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; - goto out_unlock_intent; + six_unlock_intent(&b->c.lock); + return -BCH_ERR_ENOMEM_btree_node_reclaim; } /* recheck under lock */ - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; - else if (btree_node_write_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - goto out_unlock; - } + ret = __btree_node_reclaim_checks(c, b, flush, true); + if (ret) { six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - goto wait_on_io; + if (ret == -EINTR) + goto retry_unlocked; + return ret; } - if (btree_node_noevict(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; - goto out_unlock; - } - if (btree_node_write_blocked(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; - goto out_unlock; - } - if (btree_node_will_make_reachable(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; - goto out_unlock; - } - - if (btree_node_dirty(b)) { - if (!flush) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; - goto out_unlock; - } - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - 
goto wait_on_io; - } -out: if (b->hash_val && !ret) trace_and_count(c, btree_cache_reap, c, b); - return ret; -out_unlock: - six_unlock_write(&b->c.lock); -out_unlock_intent: - six_unlock_intent(&b->c.lock); - ret = -BCH_ERR_ENOMEM_btree_node_reclaim; - goto out; + return 0; } static int btree_node_reclaim(struct bch_fs *c, struct btree *b) From 93ac4d5f92fc33dc3ba6a60a66e177ac1f4be032 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 07:47:47 -0400 Subject: [PATCH 038/218] bcachefs: Improve bch2_btree_cache_to_text() Make the output slightly clearer, and include a counter for "nodes we couldn't free because we would have gone under our reserve". Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 11 ++++++++--- fs/bcachefs/btree_types.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 9d6f78e9600e..66f18b7cdd40 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -487,7 +487,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * IO can always make forward progress: */ can_free = btree_cache_can_free(list); - nr = min_t(unsigned long, nr, can_free); + if (nr > can_free) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; + nr = can_free; + } i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { @@ -1489,9 +1492,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_btree_cache_line(out, c, "live:", bc->live[0].nr); prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); - prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); + prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? 
"held" : "not held"); prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { @@ -1502,6 +1506,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc } prt_newline(out); + prt_printf(out, "counters since mount:\n"); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); prt_printf(out, "not freed:\n"); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 81175c1344d2..325b9834134a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -139,6 +139,7 @@ struct btree { }; #define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ + x(cache_reserve) \ x(lock_intent) \ x(lock_write) \ x(dirty) \ From 2e0d51d00e84790dfad4191e0f50c33b7cac387c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 13:52:12 -0400 Subject: [PATCH 039/218] bcachefs: bch2_dev_journal_alloc() now respects data_allowed Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 7522a618b9c9..366b5493ecf7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1298,6 +1298,9 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { + if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) + return 0; + unsigned nr; int ret; From 03f8f9a1292ed3a7160a497f7a36a37d0062b1ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 16:29:36 -0400 Subject: [PATCH 040/218] bcachefs: bch2_dev_allocator_set_rw() Add a helper that lets us change bch_member.data_allowed at runtime. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 18 +++++++++++------- fs/bcachefs/alloc_background.h | 1 + 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 94ea9e49aec4..8b8c2344855f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2593,15 +2593,22 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) return ret; } +void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (rw && (ca->mi.data_allowed & BIT(i))) + set_bit(ca->dev_idx, c->rw_devs[i].d); + else + clear_bit(ca->dev_idx, c->rw_devs[i].d); +} + /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ - - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - clear_bit(ca->dev_idx, c->rw_devs[i].d); + bch2_dev_allocator_set_rw(c, ca, false); c->rw_devs_change_count++; @@ -2635,10 +2642,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (ca->mi.data_allowed & (1 << i)) - set_bit(ca->dev_idx, c->rw_devs[i].d); - + bch2_dev_allocator_set_rw(c, ca, true); c->rw_devs_change_count++; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 34b3d6ac4fbb..4f94c6a661bf 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -350,6 +350,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); +void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct 
bch_fs *, struct bch_dev *); From 6f03e30e7c6b759c7e9a67ca1c41f896db7b421a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 17:59:10 -0400 Subject: [PATCH 041/218] bcachefs: Clean up duplicated code in bch2_journal_halt() It's now a wrapper around bch2_journal_halt_locked(). Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 17 +++++++---------- fs/bcachefs/journal.h | 2 +- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 366b5493ecf7..a51ad32931b8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -331,16 +331,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } -void bch2_journal_halt(struct journal *j) -{ - spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); - spin_unlock(&j->lock); -} - void bch2_journal_halt_locked(struct journal *j) { lockdep_assert_held(&j->lock); @@ -351,6 +341,13 @@ void bch2_journal_halt_locked(struct journal *j) journal_wake(j); } +void bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + bch2_journal_halt_locked(j); + spin_unlock(&j->lock); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 886ffd9c0db6..8ff00a0ec778 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -426,8 +426,8 @@ int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); -void bch2_journal_halt(struct journal *); void bch2_journal_halt_locked(struct journal *); +void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { From f013b4ca356d37b2cc2e84a096c02dad592b1574 Mon Sep 17 00:00:00 2001 From: Alan Huang 
Date: Tue, 15 Apr 2025 13:33:04 +0800 Subject: [PATCH 042/218] bcachefs: Kill bch2_trans_unlock_noassert Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_locking.c | 7 ------- fs/bcachefs/btree_locking.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 66f18b7cdd40..e48089252bb9 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -977,7 +977,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, /* Unlock before doing IO: */ six_unlock_intent(&b->c.lock); - bch2_trans_unlock_noassert(trans); + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, sync); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 94eb2b73a843..f4f563944340 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -799,13 +799,6 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) return __bch2_trans_relock(trans, false); } -void bch2_trans_unlock_noassert(struct btree_trans *trans) -{ - __bch2_trans_unlock(trans); - - trans_set_unlocked(trans); -} - void bch2_trans_unlock(struct btree_trans *trans) { __bch2_trans_unlock(trans); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b33ab7af8440..66b27c0853a5 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -15,7 +15,6 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); -void bch2_trans_unlock_noassert(struct btree_trans *); void bch2_trans_unlock_write(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) From 152bae193c480d53283541f5e4e2d1100f8500d3 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 15 Apr 2025 13:33:06 +0800 Subject: [PATCH 043/218] bcachefs: Remove spurious +1/-1 operation Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- 
fs/bcachefs/btree_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 325b9834134a..3acccca3b3a3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -657,13 +657,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) static inline void * __btree_node_offset_to_ptr(const struct btree *b, u16 offset) { - return (void *) ((u64 *) b->data + 1 + offset); + return (void *) ((u64 *) b->data + offset); } static inline u16 __btree_node_ptr_to_offset(const struct btree *b, const void *p) { - u16 ret = (u64 *) p - 1 - (u64 *) b->data; + u16 ret = (u64 *) p - (u64 *) b->data; EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); return ret; From 0e43bf5a6a8f440f887b0472367a824c37a4031b Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 15 Apr 2025 13:33:07 +0800 Subject: [PATCH 044/218] bcachefs: Simplify logic Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 69b207502381..4832ac31392a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1018,7 +1018,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool used_mempool, blacklisted; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - unsigned u64s; unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); u64 max_journal_seq = 0; struct printbuf buf = PRINTBUF; @@ -1225,23 +1224,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; - set_btree_bset(b, b->set, &b->data->keys); - b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, 
btree_buf_bytes(b) - sizeof(struct btree_node) - b->nr.live_u64s * sizeof(u64)); - u64s = le16_to_cpu(sorted->keys.u64s); + b->data->keys.u64s = sorted->keys.u64s; *sorted = *b->data; - sorted->keys.u64s = cpu_to_le16(u64s); swap(sorted, b->data); set_btree_bset(b, b->set, &b->data->keys); b->nsets = 1; b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); - BUG_ON(b->nr.live_u64s != u64s); + BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s)); btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); From 58c36e6710d3b3cc050c35c42954cb03c3e01c59 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 13:10:03 -0400 Subject: [PATCH 045/218] bcachefs: Initialize c->name earlier on single dev filesystems On single device filesystems, c->name contains the block device name, not the UUID. Initialize this earlier, so that single device mode can use it for initializing sysfs/debugfs. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 027e10766185..75287aa2ae7f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -75,6 +75,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); MODULE_DESCRIPTION("bcachefs filesystem"); +typedef DARRAY(struct bch_sb_handle) bch_sb_handles; + const char * const bch2_fs_flag_strs[] = { #define x(n) #n, BCH_FS_FLAGS() @@ -770,7 +772,8 @@ static int bch2_fs_init_rw(struct bch_fs *c) return 0; } -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, + bch_sb_handles *sbs) { struct bch_fs *c; struct printbuf name = PRINTBUF; @@ -869,14 +872,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - pr_uuid(&name, c->sb.user_uuid.b); - ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; - if (ret) - goto err; - - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -907,6 +902,18 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } + if (sbs->nr != 1) + pr_uuid(&name, c->sb.user_uuid.b); + else + prt_bdevname(&name, sbs->data[0].bdev); + + ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); @@ -1541,11 +1548,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) struct printbuf name = PRINTBUF; prt_bdevname(&name, ca->disk_sb.bdev); - - if (c->sb.nr_devices == 1) - strscpy(c->name, name.buf, sizeof(c->name)); strscpy(ca->name, name.buf, sizeof(ca->name)); - printbuf_exit(&name); bch2_rebalance_wakeup(c); @@ -2174,7 +2177,7 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { - DARRAY(struct bch_sb_handle) sbs = { 0 }; + bch_sb_handles sbs = {}; struct bch_fs *c = NULL; struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; @@ -2227,7 +2230,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; } - c = bch2_fs_alloc(best->sb, opts); + c = bch2_fs_alloc(best->sb, opts, &sbs); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; From c02e5b57283ad6fd8dec8d834bd340bf2627fcee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Apr 2025 15:12:49 -0400 Subject: [PATCH 046/218] bcachefs: Single device mode Single device filesystems are now identified by the block device name, not the UUID - and single device filesystems with the same UUID can be mounted 
simultaneously, without any special options. This allocates a new bit in the superblock, BCH_SB_MULTI_DEVICE, which indicates whether a filesystem has ever been multi device. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/debug.c | 6 +++++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/fs.c | 7 ++++++- fs/bcachefs/opts.h | 4 ++-- fs/bcachefs/super-io.c | 4 ++++ fs/bcachefs/super.c | 30 ++++++++++++++++++++++++------ 8 files changed, 44 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1e40ad2a7bce..7782e311b6e2 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -780,6 +780,7 @@ struct bch_fs { u8 nr_devices; u8 clean; + bool multi_device; /* true if we've ever had more than one device */ u8 encryption_type; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6cbc267445b7..22ee49408d11 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -846,7 +846,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -/* one free bit */ +LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 2c52a2c6502b..312f5ce7cba9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -933,7 +933,11 @@ void bch2_fs_debug_init(struct bch_fs *c) if (IS_ERR_OR_NULL(bch_debug)) return; - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + if (c->sb.multi_device) + snprintf(name, 
sizeof(name), "%pU", c->sb.user_uuid.b); + else + strscpy(name, c->name, sizeof(name)); + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); if (IS_ERR_OR_NULL(c->fs_debug_dir)) return; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 768b176f6ea8..051938657cc9 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -201,6 +201,7 @@ x(EINVAL, device_has_been_removed) \ x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ + x(EINVAL, filesystem_uuid_already_open) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 47f1a64c5c8d..672326693f73 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2514,7 +2514,12 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); - super_set_sysfs_name_uuid(sb); + + if (c->sb.multi_device) + super_set_sysfs_name_uuid(sb); + else + strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index cbb13e91789d..c97f2a6ad29f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -518,7 +518,7 @@ enum fsck_err_opts { BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ x(discard, u8, \ - OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ + OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_MEMBER_DISCARD, true, \ NULL, "Enable discard/TRIM support") \ @@ -526,7 +526,7 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch causes 
btree nodes to be\n"\ " prefetched sequentially") struct bch_opts { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index adfcd8a92b93..2435e114cad9 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -468,6 +468,9 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); } + if (sb->nr_devices > 1) + SET_BCH_SB_MULTI_DEVICE(sb, true); + if (!flags) { /* * Been seeing a bug where these are getting inexplicably @@ -612,6 +615,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.features = le64_to_cpu(src->features[0]); c->sb.compat = le64_to_cpu(src->compat[0]); + c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 75287aa2ae7f..1c3a20d096a3 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -699,9 +699,10 @@ static int bch2_fs_online(struct bch_fs *c) lockdep_assert_held(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) { + if (c->sb.multi_device && + __bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); - return -EINVAL; + return -BCH_ERR_filesystem_uuid_already_open; } ret = bch2_fs_chardev_init(c); @@ -712,7 +713,9 @@ static int bch2_fs_online(struct bch_fs *c) bch2_fs_debug_init(c); - ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + ret = (c->sb.multi_device + ? 
kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) + : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -902,7 +905,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, goto err; } - if (sbs->nr != 1) + if (c->sb.multi_device) pr_uuid(&name, c->sb.user_uuid.b); else prt_bdevname(&name, sbs->data[0].bdev); @@ -1792,11 +1795,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb; + struct bch_sb_handle sb = {}; struct bch_dev *ca = NULL; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; - int ret; + int ret = 0; ret = bch2_read_super(path, &opts, &sb); bch_err_msg(c, ret, "reading super"); @@ -1813,6 +1816,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } } + if (list_empty(&c->list)) { + mutex_lock(&bch_fs_list_lock); + if (__bch2_uuid_to_fs(c->sb.uuid)) + ret = -BCH_ERR_filesystem_uuid_already_open; + else + list_add(&c->list, &bch_fs_list); + mutex_unlock(&bch_fs_list_lock); + + if (ret) { + bch_err(c, "filesystem UUID already open"); + goto err; + } + } + ret = bch2_dev_may_add(sb.sb, c); if (ret) goto err; @@ -1829,6 +1846,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) down_write(&c->state_lock); mutex_lock(&c->sb_lock); + SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); ret = bch2_sb_from_fs(c, ca); bch_err_msg(c, ret, "setting up new superblock"); From 83ecd1b122f49c907ea3c4178f32bd37223e7fac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 08:20:47 -0400 Subject: [PATCH 047/218] bcachefs: Use drop_locks_do() in bch2_inode_hash_find() Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c 
index 672326693f73..17a27d6d8c9d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -352,9 +352,8 @@ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btre if (!trans) { __wait_on_freeing_inode(c, inode, inum); } else { - bch2_trans_unlock(trans); - __wait_on_freeing_inode(c, inode, inum); - int ret = bch2_trans_relock(trans); + int ret = drop_locks_do(trans, + (__wait_on_freeing_inode(c, inode, inum), 0)); if (ret) return ERR_PTR(ret); } From c79eb06da4c34f29ca8bd23ddf7c1d7c1cd16121 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 09:54:01 -0400 Subject: [PATCH 048/218] bcachefs: Clean up option pre/post hooks, small fixes The helpers are now: - bch2_opt_hook_pre_set() - bch2_opts_hooks_pre_set() - bch2_opt_hook_post_set() Fix a bug where the filesystem discard option would incorrectly be changed when setting the device option, and don't trigger rebalance scans unnecessarily (when options aren't changing). Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 81 +++++++++++++++++++++++++++++++++++++++------ fs/bcachefs/opts.h | 11 +++--- fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 29 ++++------------ fs/bcachefs/xattr.c | 2 +- 5 files changed, 86 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index b3fcffc91d6f..386482ff8e7b 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -7,7 +7,9 @@ #include "compress.h" #include "disk_groups.h" #include "error.h" +#include "movinggc.h" #include "opts.h" +#include "rebalance.h" #include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -516,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) +int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) { int ret = 0; @@ -534,15 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) if (v) 
bch2_check_set_feature(c, BCH_FEATURE_ec); break; + default: + break; } return ret; } -int bch2_opts_check_may_set(struct bch_fs *c) +int bch2_opts_hooks_pre_set(struct bch_fs *c) { for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); + int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -550,6 +554,52 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } +void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, + struct bch_opts *new_opts, enum bch_opt_id id) +{ + switch (id) { + case Opt_foreground_target: + if (new_opts->foreground_target && + !new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_compression: + if (new_opts->compression && + !new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_target: + if (new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_compression: + if (new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_rebalance_enabled: + bch2_rebalance_wakeup(c); + break; + case Opt_copygc_enabled: + bch2_copygc_wakeup(c); + break; + case Opt_discard: + if (!ca) { + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member *m = + bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_DISCARD(m, c->opts.discard); + } + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; + default: + break; + } +} + int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, struct printbuf *parse_later, const char *name, const char *val) @@ -709,9 +759,11 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) return 0; } -void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, +bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 
v) { + bool changed = false; + if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -721,26 +773,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; - if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) + if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { + changed = v != opt->get_sb(sb); + opt->set_sb(sb, v); + } if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) - return; + return false; - opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v); + struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); + changed = v != opt->get_member(m); + opt->set_member(m, v); } + + return changed; } -void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, +bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, const struct bch_option *opt, u64 v) { mutex_lock(&c->sb_lock); - __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); - bch2_write_super(c); + bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? 
ca->dev_idx : -1, opt, v); + if (changed) + bch2_write_super(c); mutex_unlock(&c->sb_lock); + return changed; } /* io opts: */ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index c97f2a6ad29f..b7952405d502 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -612,10 +612,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); +bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); struct bch_dev; -void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); +bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); @@ -632,8 +632,11 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); -int bch2_opts_check_may_set(struct bch_fs *); +int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); +int bch2_opts_hooks_pre_set(struct bch_fs *); +void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, + struct bch_opts *, enum bch_opt_id); + int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1c3a20d096a3..65aab7ea182e 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1130,7 +1130,7 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - ret = bch2_opts_check_may_set(c); + ret = bch2_opts_hooks_pre_set(c); if (ret) goto err; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c 
index 82ee333ddd21..bfdadeae970e 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -637,36 +637,19 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, u64 v; ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_check_may_set(c, ca, id, v); + bch2_opt_hook_pre_set(c, ca, id, v); kfree(tmp); if (ret < 0) goto err; - bch2_opt_set_sb(c, ca, opt, v); - bch2_opt_set_by_id(&c->opts, id, v); + bool changed = bch2_opt_set_sb(c, ca, opt, v); - if (v && - (id == Opt_background_target || - (id == Opt_foreground_target && !c->opts.background_target) || - id == Opt_background_compression || - (id == Opt_compression && !c->opts.background_compression))) - bch2_set_rebalance_needs_scan(c, 0); + if (!ca) + bch2_opt_set_by_id(&c->opts, id, v); - if (v && id == Opt_rebalance_enabled) - bch2_rebalance_wakeup(c); - - if (v && id == Opt_copygc_enabled) - bch2_copygc_wakeup(c); - - if (id == Opt_discard && !ca) { - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) - opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } + if (changed) + bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); ret = size; err: diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index e6be32003f3b..423ace6272be 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -529,7 +529,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err_class_exit; - ret = bch2_opt_check_may_set(c, NULL, opt_id, v); + ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); if (ret < 0) goto err_class_exit; From 5022d0e18394c1c1cd41b5aae9ae6056b49ce678 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 10:20:46 -0400 Subject: [PATCH 049/218] bcachefs: Incompatible features may now be enabled at runtime version_upgrade is now a runtime option. 
In the future we'll want to add compatible upgrades at runtime, and call the full check_version_upgrade() when the option changes, but we don't have compatible optional upgrades just yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 9 +++++++++ fs/bcachefs/opts.h | 2 +- fs/bcachefs/recovery.c | 4 ++-- fs/bcachefs/super-io.c | 25 +++++++++++++++++++++++++ fs/bcachefs/super-io.h | 1 + 5 files changed, 38 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 386482ff8e7b..b1cf88905b81 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -595,6 +595,15 @@ void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, mutex_unlock(&c->sb_lock); } break; + case Opt_version_upgrade: + /* + * XXX: in the future we'll likely want to do compatible + * upgrades at runtime as well, but right now there's nothing + * that does that: + */ + if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) + bch2_sb_upgrade_incompat(c); + break; default: break; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b7952405d502..b8cd0b04e62a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -450,7 +450,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_version_upgrade_opts), \ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d6c4ef819d40..4c336f20d5eb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -667,7 +667,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - bch_info(c, "%s", buf.buf); + bch_notice(c, "%s", buf.buf); printbuf_exit(&buf); ret = true; @@ -683,7 +683,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, 
c->sb.version_incompat_allowed); prt_newline(&buf); - bch_info(c, "%s", buf.buf); + bch_notice(c, "%s", buf.buf); printbuf_exit(&buf); ret = true; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2435e114cad9..8ebc5e3f3ea3 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1274,6 +1274,31 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) } } +void bch2_sb_upgrade_incompat(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + if (c->sb.version == c->sb.version_incompat_allowed) + goto unlock; + + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + + bch_notice(c, "%s", buf.buf); + printbuf_exit(&buf); + + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); + bch2_write_super(c); +unlock: + mutex_unlock(&c->sb_lock); +} + static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 78f708a6fbcd..a3b7a90f2533 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -107,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) bool bch2_check_version_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); +void bch2_sb_upgrade_incompat(struct bch_fs *); void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); From bb36a12921e5fc76f3b26a80ab0217d3dc62a473 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 13:45:39 -0400 Subject: [PATCH 050/218] bcachefs: bch2_run_explicit_recovery_pass_printbuf() We prefer 
helpers that emit log messages to printbufs rather than printing them directly; that way, we can ensure that different log messages from the same event are grouped together and formatted appropriately in the dmesg log. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 33 ++++++++++++++++++--------- fs/bcachefs/error.c | 2 +- fs/bcachefs/recovery_passes.c | 42 ++++++++++++++++++++++++++--------- fs/bcachefs/recovery_passes.h | 3 +++ 4 files changed, 59 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 31fbc2716d8b..ffe957602cca 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -400,7 +400,8 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf __bch2_count_fsck_err(c, id, buf->buf, &repeat, &print, &suppress); - int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + int ret = bch2_run_explicit_recovery_pass_printbuf(c, buf, + BCH_RECOVERY_PASS_check_allocations); if (insert) { print = true; @@ -966,14 +967,27 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, bucket_metadata_type_mismatch, - "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - bch2_data_type_str(type), - bch2_data_type_str(type)); + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s\n", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); + + bool repeat = false, print = true, suppress = false; + bch2_count_fsck_err(c, 
bucket_metadata_type_mismatch, buf.buf, + &repeat, &print, &suppress); + + bch2_run_explicit_recovery_pass_printbuf(c, &buf, + BCH_RECOVERY_PASS_check_allocations); + + if (suppress) + prt_printf(&buf, "Ratelimiting new instances of previous error\n"); + if (print) + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_metadata_bucket_inconsistency; goto err; } @@ -985,7 +999,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: -fsck_err: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 6b8695b1349c..faeadffa1103 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -104,7 +104,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) __bch2_inconsistent_error(c, out); return -BCH_ERR_btree_need_topology_repair; } else { - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + return bch2_run_explicit_recovery_pass_printbuf(c, out, BCH_RECOVERY_PASS_check_topology) ?: -BCH_ERR_btree_node_read_validate_error; } } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 22f72bb5b853..946428daeecc 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -101,7 +101,8 @@ u64 bch2_recovery_passes_from_stable(u64 v) /* * For when we need to rewind recovery passes and run a pass we skipped: */ -static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, +static int __bch2_run_explicit_recovery_pass(struct printbuf *out, + struct bch_fs *c, enum bch_recovery_pass pass) { if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) @@ -115,15 +116,15 @@ static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, if (pass < BCH_RECOVERY_PASS_set_may_go_rw && c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { if (print) - bch_info(c, "need recovery pass %s (%u), but already rw", - 
bch2_recovery_passes[pass], pass); + prt_printf(out, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); return -BCH_ERR_cannot_rewind_recovery; } if (print) - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); c->opts.recovery_passes |= BIT_ULL(pass); @@ -136,13 +137,34 @@ static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass) +{ + bch2_printbuf_make_room(out, 1024); + out->atomic++; + + unsigned long flags; + spin_lock_irqsave(&c->recovery_pass_lock, flags); + int ret = __bch2_run_explicit_recovery_pass(out, c, pass); + spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + + --out->atomic; + return ret; +} + int bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - unsigned long flags; - spin_lock_irqsave(&c->recovery_pass_lock, flags); - int ret = __bch2_run_explicit_recovery_pass(c, pass); - spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + unsigned len = buf.pos; + + int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf, pass); + + if (len != buf.pos) + bch2_print_string_as_lines(KERN_NOTICE, buf.buf); + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 7d7339c8fa29..e19a8aaba2f8 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -8,6 +8,9 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); +int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *, 
+ struct printbuf *, + enum bch_recovery_pass); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); From 2085325171f2f2d33a94101e58266f325c286e95 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 13:55:16 -0400 Subject: [PATCH 051/218] bcachefs: Simplify bch2_count_fsck_err() Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 16 +++------------- fs/bcachefs/error.c | 14 ++++++++++---- fs/bcachefs/error.h | 4 +--- fs/bcachefs/fs-io.c | 7 ++----- fs/bcachefs/io_write.c | 4 +--- 5 files changed, 17 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ffe957602cca..8e64077c15c1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -392,29 +392,23 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf struct bkey_s_c k, bool insert, enum bch_sb_error_id id) { struct bch_fs *c = trans->c; - bool repeat = false, print = true, suppress = false; prt_printf(buf, "\nwhile marking "); bch2_bkey_val_to_text(buf, c, k); prt_newline(buf); - __bch2_count_fsck_err(c, id, buf->buf, &repeat, &print, &suppress); + bool print = __bch2_count_fsck_err(c, id, buf); int ret = bch2_run_explicit_recovery_pass_printbuf(c, buf, BCH_RECOVERY_PASS_check_allocations); if (insert) { - print = true; - suppress = false; - bch2_trans_updates_to_text(buf, trans); __bch2_inconsistent_error(c, buf); ret = -BCH_ERR_bucket_ref_update; } - if (suppress) - prt_printf(buf, "Ratelimiting new instances of previous error\n"); - if (print) + if (print || insert) bch2_print_string_as_lines(KERN_ERR, buf->buf); return ret; } @@ -976,15 +970,11 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_type_str(type), bch2_data_type_str(type)); - bool repeat = false, print 
= true, suppress = false; - bch2_count_fsck_err(c, bucket_metadata_type_mismatch, buf.buf, - &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); bch2_run_explicit_recovery_pass_printbuf(c, &buf, BCH_RECOVERY_PASS_check_allocations); - if (suppress) - prt_printf(&buf, "Ratelimiting new instances of previous error\n"); if (print) bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index faeadffa1103..4627aabd1f1a 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -376,15 +376,21 @@ static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, return s; } -void __bch2_count_fsck_err(struct bch_fs *c, - enum bch_sb_error_id id, const char *msg, - bool *repeat, bool *print, bool *suppress) +bool __bch2_count_fsck_err(struct bch_fs *c, + enum bch_sb_error_id id, struct printbuf *msg) { bch2_sb_error_count(c, id); mutex_lock(&c->fsck_error_msgs_lock); - count_fsck_err_locked(c, id, msg, repeat, print, suppress); + bool print = true, repeat = false, suppress = false; + + count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); mutex_unlock(&c->fsck_error_msgs_lock); + + if (suppress) + prt_printf(msg, "Ratelimiting new instances of previous error\n"); + + return print && !repeat; } int __bch2_fsck_err(struct bch_fs *c, diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 4a364fd44abe..0b3ede1c2015 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -76,9 +76,7 @@ struct fsck_err_state { #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -void __bch2_count_fsck_err(struct bch_fs *, - enum bch_sb_error_id, const char *, - bool *, bool *, bool *); +bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); #define bch2_count_fsck_err(_c, _err, ...) 
\ __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9657144666b8..b81117b51c69 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -151,8 +151,7 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, inode->v.i_ino, (u64) inode->v.i_blocks, sectors, inode->ei_inode.bi_sectors); - bool repeat = false, print = false, suppress = false; - bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, buf.buf, &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); if (print) bch2_print_str(c, buf.buf); printbuf_exit(&buf); @@ -526,9 +525,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, inode->v.i_ino, (u64) inode->v.i_blocks, inode->ei_inode.bi_sectors); - bool repeat = false, print = false, suppress = false; - bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, buf.buf, - &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); if (print) bch2_print_str(c, buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 401347e135b7..c738ae6fd9a5 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -263,9 +263,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", extent_iter->pos.inode, bi_sectors, i_sectors_delta); - bool repeat = false, print = false, suppress = false; - bch2_count_fsck_err(c, inode_i_sectors_underflow, buf.buf, - &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); if (print) bch2_print_str(c, buf.buf); printbuf_exit(&buf); From 040c762152f5f4fb1b13e8a46c17ecb4e670d96d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 14:08:42 -0400 Subject: [PATCH 052/218] bcachefs: bch2_dev_missing_bkey() Part of the ongoing project to kill off 
bch2_(fs|trans)_inconsistent calls - they generally need to be replaced with either - a fsck_err() call that can repair the error, or - logging an error of the appropriate type in the superblock, and flagging the appropriate recovery pass to repair the error Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 10 +++++++++- fs/bcachefs/sb-members.c | 22 +++++++++++++++++++++- fs/bcachefs/sb-members.h | 8 +++++--- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a369f978ffe6..ef116c55f0a7 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -158,7 +158,15 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (dev >= 0 && p.ptr.dev != dev) continue; - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + + if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { + rcu_read_unlock(); + int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); + if (ret) + return ret; + rcu_read_lock(); + } if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 72779912939b..f776b00c3cc0 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -5,11 +5,31 @@ #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" -void bch2_dev_missing(struct bch_fs *c, unsigned dev) +int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) +{ + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); + bch2_bkey_val_to_text(&buf, c, k); + + bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); + + int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf, + BCH_RECOVERY_PASS_check_allocations); + + if (print) + 
bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + return ret; +} + +void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) { if (dev != BCH_SB_MEMBER_INVALID) bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 42786657522c..0f1741fffcb6 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -218,13 +218,15 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de : NULL; } -void bch2_dev_missing(struct bch_fs *, unsigned); +int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); + +void bch2_dev_missing_atomic(struct bch_fs *, unsigned); static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } @@ -242,7 +244,7 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } From ebf561b2083d797da4673207044855ccd764195b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 17:31:47 -0400 Subject: [PATCH 053/218] bcachefs: print_str_as_lines() -> print_str() bch2_print_string_as_lines() is a low-level helper that allows messages longer than 1k to be printed without truncation. But we should always be printing with the helpers that take a filesystem object: if we're in fsck, they direct output to the userspace process controlling fsck instead of the dmesg log.
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/bcachefs.h | 3 ++- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 4 ++-- fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/btree_node_scan.c | 6 +++--- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/buckets.c | 6 +++--- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/error.c | 10 +++++----- fs/bcachefs/fs-io.c | 4 ++-- fs/bcachefs/io_write.c | 2 +- fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/recovery_passes.c | 2 +- fs/bcachefs/sb-members.c | 2 +- fs/bcachefs/super.c | 15 +++++++++++++-- fs/bcachefs/util.c | 14 ++------------ fs/bcachefs/util.h | 3 +-- 19 files changed, 44 insertions(+), 43 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 31d2207a071b..d56cee7e8cb5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1664,7 +1664,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) bch2_journal_debug_to_text(&buf, &c->journal); printbuf_indent_sub(&buf, 2); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7782e311b6e2..4fd096349790 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -269,7 +269,8 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") -void bch2_print_str(struct bch_fs *, const char *); +void bch2_print_str(struct bch_fs *, const char *, const char *); +void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 4832ac31392a..b6f5e0dfc9f1 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -605,7 +605,7 @@ static int __btree_err(int ret, } if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); + 
bch2_print_str(c, KERN_ERR, out.buf); out: fsck_err: printbuf_exit(&out); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cfd6363dfc39..bd3a0bc07511 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1591,7 +1591,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); - bch2_print_str(trans->c, buf.buf); + bch2_print_str(trans->c, KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -3121,7 +3121,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE struct printbuf buf = PRINTBUF; bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); #endif } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index f4f563944340..baa505a9a706 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -236,7 +236,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, prt_newline(&buf); } - bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); + bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 86acf037590c..81ee7ae88a77 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -395,7 +395,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); @@ -424,7 +424,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) 
printbuf_reset(&buf); prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } swap(nodes_heap, f->nodes); @@ -470,7 +470,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } else { bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 00307356d7c8..2be7c10fc59c 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1807,7 +1807,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_update_to_text(&buf, as); bch2_btree_path_to_text(&buf, trans, path_idx); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); bch2_fs_emergency_read_only(c); return -EIO; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8e64077c15c1..36c1e391d4df 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -409,7 +409,7 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf } if (print || insert) - bch2_print_string_as_lines(KERN_ERR, buf->buf); + bch2_print_str(c, KERN_ERR, buf->buf); return ret; } @@ -706,7 +706,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, (u64) p.ec.idx); bch2_bkey_val_to_text(&buf, c, k); __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return -BCH_ERR_trigger_stripe_pointer; } @@ -976,7 +976,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, 
BCH_RECOVERY_PASS_check_allocations); if (print) - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); ret = -BCH_ERR_metadata_bucket_inconsistency; goto err; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b211c97238ab..c3034338f9e4 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -358,7 +358,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, prt_str(&buf, "\nnew: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); bch2_fatal_error(c); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 4627aabd1f1a..20495062d6e1 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -34,7 +34,7 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: - bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); + bch2_print_str(c, KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -71,7 +71,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); + bch2_print_str_nonblocking(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; @@ -121,7 +121,7 @@ int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) 
va_end(args); int ret = __bch2_topology_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; @@ -328,7 +328,7 @@ static int do_fsck_ask_yn(struct bch_fs *c, if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", question->buf); else - bch2_print_string_as_lines(KERN_ERR, question->buf); + bch2_print_str(c, KERN_ERR, question->buf); int ask = bch2_fsck_ask_yn(c, trans); @@ -565,7 +565,7 @@ int __bch2_fsck_err(struct bch_fs *c, if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", out->buf); else - bch2_print_string_as_lines(KERN_ERR, out->buf); + bch2_print_str(c, KERN_ERR, out->buf); } if (s) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b81117b51c69..7200ec00128d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -153,7 +153,7 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); if (print) - bch2_print_str(c, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); if (sectors < 0) @@ -527,7 +527,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); if (print) - bch2_print_str(c, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index c738ae6fd9a5..38086c1a8e28 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -265,7 +265,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); if (print) - bch2_print_str(c, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); if (i_sectors_delta < 0) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a51ad32931b8..5442d526a448 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -173,7 +173,7 @@ 
journal_error_check_stuck(struct journal *j, int error, unsigned flags) spin_unlock(&j->lock); prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), bch2_err_str(error)); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_reset(&buf); bch2_journal_pins_to_text(&buf, j); @@ -743,7 +743,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); printbuf_exit(&buf); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ded18a94ed02..28fa381cd589 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2107,7 +2107,7 @@ CLOSURE_CALLBACK(bch2_journal_write) le64_to_cpu(w->data->seq), vstruct_sectors(w->data, c->block_bits), bch2_err_str(ret)); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } if (ret) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 946428daeecc..de1a14c4bc3c 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -163,7 +163,7 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf, pass); if (len != buf.pos) - bch2_print_string_as_lines(KERN_NOTICE, buf.buf); + bch2_print_str(c, KERN_NOTICE, buf.buf); printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index f776b00c3cc0..77809ee23c45 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -24,7 +24,7 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) BCH_RECOVERY_PASS_check_allocations); if (print) - 
bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 65aab7ea182e..b34c91dd51b1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -84,7 +84,8 @@ const char * const bch2_fs_flag_strs[] = { NULL }; -void bch2_print_str(struct bch_fs *c, const char *str) +static void __bch2_print_str(struct bch_fs *c, const char *prefix, + const char *str, bool nonblocking) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -94,7 +95,17 @@ void bch2_print_str(struct bch_fs *c, const char *str) return; } #endif - bch2_print_string_as_lines(KERN_ERR, str); + bch2_print_string_as_lines(KERN_ERR, str, nonblocking); +} + +void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) +{ + __bch2_print_str(c, prefix, str, false); +} + +void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str) +{ + __bch2_print_str(c, prefix, str, true); } __printf(2, 0) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 87af551692f4..6e5d7fc265bd 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -252,8 +252,8 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } -static void __bch2_print_string_as_lines(const char *prefix, const char *lines, - bool nonblocking) +void bch2_print_string_as_lines(const char *prefix, const char *lines, + bool nonblocking) { bool locked = false; const char *p; @@ -281,16 +281,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, console_unlock(); } -void bch2_print_string_as_lines(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, false); -} - -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, true); -} - int 
bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, gfp_t gfp) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index ccc1cf699c4b..50f7197c67fc 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -212,8 +212,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); -void bch2_print_string_as_lines(const char *prefix, const char *lines); -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); +void bch2_print_string_as_lines(const char *, const char *, bool); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); From bdad8962c94d3e557317b3d0691a507177f27a22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 19:15:43 -0400 Subject: [PATCH 054/218] bcachefs: Flag for repair on missing subvolume Instead of going emergency read-only with a bch2_fs_inconsistent() call, log the error and flag the appropriate recovery pass. If we're still in recovery it'll be repaired immediately, otherwise it'll be repaired on the next mount. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 43 +++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 0421ffc1128f..c9d7209f0cb1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -14,6 +14,22 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) +{ + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "missing subvolume %u", subvolid); + bool print = bch2_count_fsck_err(c, subvol_missing, &buf); + + int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf, + BCH_RECOVERY_PASS_check_inodes); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return ret; +} + static struct bpos subvolume_children_pos(struct bkey_s_c k) { if (k.k->type != KEY_TYPE_subvolume) @@ -292,9 +308,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), BTREE_ITER_cached| BTREE_ITER_with_updates, subvolume, s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && - inconsistent_if_not_found, - trans->c, "missing subvolume %u", subvol); + if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) + ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; return ret; } @@ -344,8 +359,8 @@ int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, subvolume); ret = bkey_err(subvol); - bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; if (likely(!ret)) *snapid = le32_to_cpu(subvol.v->snapshot); @@ -418,8 +433,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) BTREE_ITER_cached|BTREE_ITER_intent, subvolume); int ret = 
bkey_err(subvol); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; if (ret) goto err; @@ -553,11 +568,10 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) BTREE_ID_subvolumes, POS(0, subvolid), BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(n); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; + if (unlikely(ret)) return ret; - } SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); n->v.fs_path_parent = 0; @@ -596,11 +610,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, BTREE_ID_subvolumes, POS(0, src_subvolid), BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "subvolume %u not found", src_subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; + if (unlikely(ret)) goto err; - } parent = le32_to_cpu(src_subvol->v.snapshot); } From d12bd4101825c14222ecbe4c6fba9c03ce42f624 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 19:21:52 -0400 Subject: [PATCH 055/218] bcachefs: Add a recovery pass for making sure root inode is readable If the root inode/subvolume is unreadable we can repair automatically - but only if we're still in recovery, so that we can rewind to the appropriate recovery pass. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 15 +++++++++++++++ fs/bcachefs/recovery_passes_types.h | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index de1a14c4bc3c..b4de21f80811 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -52,6 +52,21 @@ static int bch2_set_may_go_rw(struct bch_fs *c) return 0; } +/* + * Make sure root inode is readable while we're still in recovery and can rewind + * for repair: + */ +static int bch2_lookup_root_inode(struct bch_fs *c) +{ + subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + + return bch2_trans_do(c, + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); +} + struct recovery_pass_fn { int (*fn)(struct bch_fs *); unsigned when; diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index e89b9c783285..4671ccf2d560 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -60,7 +60,8 @@ x(resume_logged_ops, 23, PASS_ALWAYS) \ x(delete_dead_inodes, 32, PASS_ALWAYS) \ x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) + x(set_fs_needs_rebalance, 34, 0) \ + x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) /* We normally enumerate recovery passes in the order we run them: */ enum bch_recovery_pass { From 1c8dfd7ba50dbbb72113caf4fa7868512cdad2f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 23:35:48 -0400 Subject: [PATCH 056/218] bcachefs: sb_validate() no longer requires members_v1 Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8ebc5e3f3ea3..872707e5fa95 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -384,7 
+384,6 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, enum bch_validate_flags flags, struct printbuf *out) { - struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; int ret; @@ -539,14 +538,17 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, } } + struct bch_sb_field *mi = + bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: + bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); + /* members must be validated first: */ - mi = bch2_sb_field_get(sb, members_v1); if (!mi) { prt_printf(out, "Invalid superblock: member info area missing"); return -BCH_ERR_invalid_sb_members_missing; } - ret = bch2_sb_field_validate(sb, &mi->field, flags, out); + ret = bch2_sb_field_validate(sb, mi, flags, out); if (ret) return ret; From 0dc73809e93aeb905acf9fa88502c73534cfa83d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Apr 2025 06:48:31 -0400 Subject: [PATCH 057/218] bcachefs: Shrink superblock downgrade table Don't generate entries for versions that won't be able to mount. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index badd0e17ada5..296c6c925386 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -374,6 +374,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c) if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) continue; + if (src->version < c->sb.version_incompat) + continue; + struct bch_sb_field_downgrade_entry *dst; unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; From 576493133f26a172b8db4313448206d30750c9b2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Apr 2025 09:23:15 -0400 Subject: [PATCH 058/218] bcachefs: Print features on startup with -o verbose Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index b34c91dd51b1..60e632e22b98 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1054,6 +1054,11 @@ static void print_mount_opts(struct bch_fs *c) bch2_version_to_text(&p, c->sb.version_incompat_allowed); } + if (c->opts.verbose) { + prt_printf(&p, "\n features: "); + prt_bitflags(&p, bch2_sb_features, c->sb.features); + } + bch_info(c, "%s", p.buf); printbuf_exit(&p); } From 203852d9db68e14b50b119cbd123def7e7c9efd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 14:19:23 -0400 Subject: [PATCH 059/218] bcachefs: BCH_FEATURE_no_alloc_info If a filesystem is going to only be used read-only, and will be a deployable image, we can strip out alloc info for a substantial reduction in metadata size - around half, due to backpointers. Alloc info will be regenerated on first read-write mount. Remounting RW is disallowed for now, since we don't yet have check_allocations running in RW mode. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/recovery.c | 28 ++++++++++++++++++++++------ fs/bcachefs/recovery.h | 1 + fs/bcachefs/recovery_passes.c | 14 +++++++++++++- fs/bcachefs/recovery_passes_types.h | 18 ++++++++++-------- fs/bcachefs/sb-members.c | 6 ++++++ fs/bcachefs/super.c | 6 ++++++ 8 files changed, 61 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 22ee49408d11..c0041391e2e8 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -923,7 +923,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(alloc_v2, 17) \ x(extents_across_btree_nodes, 18) \ x(incompat_version_field, 19) \ - x(casefolding, 20) + x(casefolding, 20) \ + x(no_alloc_info, 21) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 051938657cc9..8a4435660d86 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -221,6 +221,7 @@ x(EROFS, erofs_unfixed_errors) \ x(EROFS, erofs_norecovery) \ x(EROFS, erofs_nochanges) \ + x(EROFS, erofs_no_alloc_info) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 4c336f20d5eb..b5ab77f3c692 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -33,7 +33,6 @@ #include #include - int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { u64 b = BIT_ULL(btree); @@ -114,11 +113,8 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree) } /* for -o reconstruct_alloc: */ -static void bch2_reconstruct_alloc(struct bch_fs *c) +void bch2_reconstruct_alloc(struct bch_fs *c) { - bch2_journal_log_msg(c, "dropping alloc info"); - bch_info(c, "dropping and reconstructing all alloc info"); - 
mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); @@ -160,6 +156,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); + bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -889,8 +887,26 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - if (c->opts.reconstruct_alloc) + if (!c->opts.read_only && + (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + bch2_reconstruct_alloc(c); + } else if (c->opts.reconstruct_alloc) { + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + + bch2_reconstruct_alloc(c); + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + /* We can't go RW to fix errors without alloc info */ + if (c->opts.fix_errors == FSCK_FIX_yes || + c->opts.fix_errors == FSCK_FIX_ask) + c->opts.fix_errors = FSCK_FIX_no; + if (c->opts.errors == BCH_ON_ERROR_fix_safe) + c->opts.errors = BCH_ON_ERROR_continue; + } /* * After an unclean shutdown, skip then next few journal sequence diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index b0d55754b21b..d858ba674eaa 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -3,6 +3,7 @@ #define _BCACHEFS_RECOVERY_H int bch2_btree_lost_data(struct bch_fs *, enum btree_id); +void bch2_reconstruct_alloc(struct bch_fs *); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index b4de21f80811..87150dd30f4b 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -47,8 +47,18 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || !c->opts.read_only || 
c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (keys->nr || + !c->opts.read_only || + !c->sb.clean || + c->opts.recovery_passes || + (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) { + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + bch2_reconstruct_alloc(c); + } + return bch2_fs_read_write_early(c); + } return 0; } @@ -240,6 +250,8 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa { struct recovery_pass_fn *p = recovery_pass_fns + pass; + if ((p->when & PASS_ALLOC) && (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) + return false; if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) return false; if (c->opts.recovery_passes & BIT_ULL(pass)) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 4671ccf2d560..f9d565bb50dd 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -7,6 +7,8 @@ #define PASS_UNCLEAN BIT(2) #define PASS_ALWAYS BIT(3) #define PASS_ONLINE BIT(4) +#define PASS_ALLOC BIT(5) +#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) #ifdef CONFIG_BCACHEFS_DEBUG #define PASS_FSCK_DEBUG BIT(1) @@ -27,17 +29,17 @@ x(stripes_read, 1, 0) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ + x(check_allocations, 5, PASS_FSCK_ALLOC) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ 
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ x(reconstruct_snapshots, 38, 0) \ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 77809ee23c45..39ce94875dde 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -190,6 +190,12 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && + sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { + prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); + return -BCH_ERR_invalid_sb_members; + } + return 0; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 60e632e22b98..6ab3e63ef139 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -443,6 +443,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) + return -BCH_ERR_erofs_no_alloc_info; + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; @@ -535,6 +538,9 @@ int bch2_fs_read_write(struct bch_fs *c) if (c->opts.nochanges) return -BCH_ERR_erofs_nochanges; + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) + return -BCH_ERR_erofs_no_alloc_info; + return __bch2_fs_read_write(c, false); } From 530112d88ebd7405fb61711892b2a680048984c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 15:15:36 
-0400 Subject: [PATCH 060/218] bcachefs: BCH_FEATURE_small_image We can't go RW if it's an image file that hasn't been resized. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 +++++--- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/journal.c | 9 ++++++++- fs/bcachefs/journal_reclaim.c | 22 ++++++++++++---------- fs/bcachefs/recovery.c | 5 +++++ fs/bcachefs/super.c | 5 +++++ 7 files changed, 38 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8b8c2344855f..6ac8bd49c629 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2392,14 +2392,16 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, int bch2_fs_freespace_init(struct bch_fs *c) { - int ret = 0; - bool doing_init = false; + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) + return 0; + /* * We can crash during the device add path, so we need to check this on * every mount: */ + bool doing_init = false; for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2409,7 +2411,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { bch2_dev_put(ca); bch_err_fn(c, ret); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c0041391e2e8..7ce475c565b5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -924,7 +924,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(extents_across_btree_nodes, 18) \ x(incompat_version_field, 19) \ x(casefolding, 20) \ - x(no_alloc_info, 21) + x(no_alloc_info, 21) \ + x(small_image, 22) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 8a4435660d86..6a4b3fe9ea99 
100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -222,6 +222,7 @@ x(EROFS, erofs_norecovery) \ x(EROFS, erofs_nochanges) \ x(EROFS, erofs_no_alloc_info) \ + x(EROFS, erofs_filesystem_full) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5442d526a448..3694b83af8cc 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1295,9 +1295,16 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { + struct bch_fs *c = ca->fs; + if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) return 0; + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); + return -BCH_ERR_erofs_filesystem_full; + } + unsigned nr; int ret; @@ -1318,7 +1325,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); + ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index cc00b0fc40d8..a02f483a016a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -215,18 +215,20 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw 
journal devs:", nr_online, metadata_replicas_required(c)); - rcu_read_lock(); - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); + rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + rcu_read_unlock(); - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } ret = -BCH_ERR_insufficient_journal_devices; goto out; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b5ab77f3c692..2436f334dde4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -734,6 +734,11 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.read_only = true; } + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_info(c, "filesystem is an unresized image file, mounting ro"); + c->opts.read_only = true; + } + mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6ab3e63ef139..7cd075303f95 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -451,6 +451,11 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return -BCH_ERR_erofs_unfixed_errors; } + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot go rw, filesystem is an unresized image file"); + return -BCH_ERR_erofs_filesystem_full; + } + if (test_bit(BCH_FS_rw, &c->flags)) return 0; From 0ca375b1779f22f703f956b22ea2bdbc69c247eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 14:09:34 -0400 Subject: [PATCH 061/218] bcachefs: BCH_MEMBER_RESIZE_ON_MOUNT Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 18 +++++++--- fs/bcachefs/sb-members.c | 1 + fs/bcachefs/sb-members.h | 1 + fs/bcachefs/sb-members_format.h | 2 ++ fs/bcachefs/sb-members_types.h | 1 + fs/bcachefs/super.c | 64 +++++++++++++++++++++++++++++---- 
fs/bcachefs/super.h | 2 ++ 7 files changed, 77 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2436f334dde4..2a8bcb9b1dd2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -734,11 +734,6 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.read_only = true; } - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_info(c, "filesystem is an unresized image file, mounting ro"); - c->opts.read_only = true; - } - mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; @@ -892,6 +887,17 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + ret = bch2_fs_resize_on_mount(c); + if (ret) { + up_write(&c->state_lock); + goto err; + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_info(c, "filesystem is an unresized image file, mounting ro"); + c->opts.read_only = true; + } + if (!c->opts.read_only && (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); @@ -954,6 +960,8 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_btree_running, &c->flags); ret = bch2_sb_set_upgrade_extra(c); + if (ret) + goto err; ret = bch2_run_recovery_passes(c); if (ret) diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 39ce94875dde..462a2c21a9de 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -294,6 +294,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 0f1741fffcb6..424143f5e330 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -353,6 +353,7 @@ 
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 3affec823b3f..472218a59102 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -88,6 +88,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, struct bch_member, flags, 30, 31) +LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, + struct bch_member, flags, 31, 32) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h index c0eda888fe39..d6443e186872 100644 --- a/fs/bcachefs/sb-members_types.h +++ b/fs/bcachefs/sb-members_types.h @@ -13,6 +13,7 @@ struct bch_member_cpu { u8 data_allowed; u8 durability; u8 freespace_initialized; + u8 resize_on_mount; u8 valid; u8 btree_bitmap_shift; u64 btree_allocated_bitmap; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7cd075303f95..839b1582c1f1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1141,6 +1141,9 @@ int bch2_fs_start(struct bch_fs *c) for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + /* + * Dno't write superblock yet: recovery might have to downgrade + */ mutex_unlock(&c->sb_lock); for_each_rw_member(c, ca) @@ -2039,6 +2042,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; } +static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) +{ + struct bch_fs 
*c = ca->fs; + u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; + + return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); +} + int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bch_member *m; @@ -2086,13 +2101,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - - ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod2(trans, false, v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); if (ret) goto err; } @@ -2103,6 +2112,47 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) return ret; } +int bch2_fs_resize_on_mount(struct bch_fs *c) +{ + for_each_online_member(c, ca) { + u64 old_nbuckets = ca->mi.nbuckets; + u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), + ca->mi.bucket_size); + + if (ca->mi.resize_on_mount && + new_nbuckets > ca->mi.nbuckets) { + bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); + int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); + bch_err_fn(ca, ret); + if (ret) { + percpu_ref_put(&ca->io_ref[READ]); + up_write(&c->state_lock); + return ret; + } + + mutex_lock(&c->sb_lock); + struct bch_member *m = + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(new_nbuckets); + SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); + + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (ca->mi.freespace_initialized) { + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); 
+ if (ret) { + percpu_ref_put(&ca->io_ref[READ]); + up_write(&c->state_lock); + return ret; + } + } + } + } + return 0; +} + /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 23533bce5709..50588ab20be2 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -35,6 +35,8 @@ void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); +int bch2_fs_resize_on_mount(struct bch_fs *); + void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); From ecedc87cfaf016e7e857a209e1b2685a28d59566 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 19:33:54 -0400 Subject: [PATCH 062/218] bcachefs: export bch2_move_data_phys() Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 20 ++++++++++---------- fs/bcachefs/move.h | 5 +++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a4678a205da6..29981ebcb972 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -900,16 +900,16 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, return ret; } -static int bch2_move_data_phys(struct bch_fs *c, - unsigned dev, - u64 start, - u64 end, - unsigned data_types, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) +int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) { struct moving_context ctxt; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 51e0505a8156..1ab6dd4621d6 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -135,6 +135,11 @@ int 
bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); +int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool, + move_pred_fn, void *); + int bch2_evacuate_bucket(struct moving_context *, struct move_bucket_in_flight *, struct bpos, int, From f3c8eaf7a133ef122dfd97e6f6f972265cc84fb0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 19:42:02 -0400 Subject: [PATCH 063/218] bcachefs: Plumb target parameter through btree_node_rewrite_pos() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_update_interior.c | 37 +++++++++++++++++++---------- fs/bcachefs/btree_update_interior.h | 4 ++-- fs/bcachefs/move.c | 5 ++-- 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b6f5e0dfc9f1..c1c671e340c7 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1918,7 +1918,7 @@ static void btree_node_scrub_work(struct work_struct *work) bch_err(c, "error validating btree node during scrub on %s at btree %s", scrub->ca->name, err.buf); - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); } err: bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2be7c10fc59c..3155b4360fbc 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -284,6 +284,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, bool interior_node, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -317,6 +318,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, + target ?: c->opts.metadata_target ?: 
c->opts.foreground_target, 0, @@ -325,7 +327,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, res->nr_replicas, min(res->nr_replicas, c->opts.metadata_replicas_required), - watermark, 0, cl, &wp); + watermark, + target ? BCH_WRITE_only_specified_devs : 0, + cl, &wp); if (unlikely(ret)) goto err; @@ -505,6 +509,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * static int bch2_btree_reserve_get(struct btree_trans *trans, struct btree_update *as, unsigned nr_nodes[2], + unsigned target, unsigned flags, struct closure *cl) { @@ -527,7 +532,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, while (p->nr < nr_nodes[interior]) { b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, flags); + interior, target, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -1116,7 +1121,8 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, unsigned flags) + unsigned level_start, bool split, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1226,7 +1232,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); if (bch2_err_matches(ret, ENOSPC) || bch2_err_matches(ret, ENOMEM)) { struct closure cl; @@ -1245,7 +1251,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init_stack(&cl); do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); @@ -1878,7 +1884,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, as = 
bch2_btree_update_start(trans, trans->paths + path, trans->paths[path].level, - true, flags); + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1948,7 +1954,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, return bch2_btree_split_leaf(trans, path, flags); struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2077,7 +2084,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(trans->paths + path, b); as = bch2_btree_update_start(trans, trans->paths + path, level, false, - BCH_TRANS_COMMIT_no_enospc|flags); + 0, BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -2184,6 +2191,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -2196,7 +2204,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, iter); parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, false, flags); + as = bch2_btree_update_start(trans, path, b->c.level, + false, target, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -2261,7 +2270,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, flags) + ? 
bch2_btree_node_rewrite(trans, &iter, b, 0, flags) : -ENOENT; out: bch2_trans_iter_exit(trans, &iter); @@ -2270,7 +2279,9 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, int bch2_btree_node_rewrite_pos(struct btree_trans *trans, enum btree_id btree, unsigned level, - struct bpos pos, unsigned flags) + struct bpos pos, + unsigned target, + unsigned flags) { BUG_ON(!level); @@ -2282,7 +2293,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans, if (ret) goto err; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -2296,7 +2307,7 @@ int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, if (ret) return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index be71cd73b864..ff9b95aac554 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -168,10 +168,10 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, } int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); + struct btree *, unsigned, unsigned); int bch2_btree_node_rewrite_pos(struct btree_trans *, enum btree_id, unsigned, - struct bpos, unsigned); + struct bpos, unsigned, unsigned); int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, struct btree *, unsigned); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 29981ebcb972..d40e2d14ec52 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -872,7 +872,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!bp.v->level) ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); else if 
(!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, + k.k->p, data_opts.target, 0); else ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); @@ -1022,7 +1023,7 @@ static int bch2_move_btree(struct bch_fs *c, if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) From 7a274285d3706608d788efcbd9982f08531dd9ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Apr 2025 19:51:05 -0400 Subject: [PATCH 064/218] bcachefs: plumb btree_id through move_pred_fd Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 20 +++++++++++--------- fs/bcachefs/move.h | 2 +- fs/bcachefs/rebalance.c | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d40e2d14ec52..07cea68b04f0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -667,7 +667,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, k, io_opts, &data_opts)) + if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) goto next; /* @@ -851,7 +851,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, } struct data_update_opts data_opts = {}; - if (!pred(c, arg, k, &io_opts, &data_opts)) { + if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) { bch2_trans_iter_exit(trans, &iter); goto next; } @@ -934,7 +934,8 @@ struct evacuate_bucket_arg { struct data_update_opts data_opts; }; -static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts 
*io_opts, struct data_update_opts *data_opts) { @@ -1048,7 +1049,7 @@ static int bch2_move_btree(struct bch_fs *c, } static bool rereplicate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1080,7 +1081,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, } static bool migrate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1107,7 +1108,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); } /* @@ -1163,7 +1164,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) } static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1196,11 +1197,12 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), + io_opts, data_opts); } static bool scrub_pred(struct bch_fs *c, void *_arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 1ab6dd4621d6..9c6c229e583e 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -72,7 +72,7 @@ do { \ break; \ } while (1) -typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, 
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); extern const char * const bch2_data_ops_strs[]; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 3c45500c1a28..d2a7001cf872 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -454,7 +454,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, } static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { From 3484840ece849ee700c7cf8e0d44d5536b29fa08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 16:31:34 -0400 Subject: [PATCH 065/218] bcachefs: bch2_move_data_btree() can move btree nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 37 ++++++++++++++++++++++++------------- fs/bcachefs/move.h | 2 ++ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 07cea68b04f0..a8ad8d4538e0 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -423,6 +423,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; + if (extent_iter->min_depth) + return opts_ret; + if (extent_k.k->type == KEY_TYPE_reflink_v) goto out; @@ -573,11 +576,11 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans * return k; } -static int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id) +int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id, unsigned level) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -604,10 +607,10 @@ static int bch2_move_data_btree(struct 
moving_context *ctxt, } bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); @@ -627,7 +630,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ret) break; - if (bkey_ge(bkey_start_pos(k.k), end)) + if (bkey_gt(bkey_start_pos(k.k), end)) break; if (ctxt->stats) @@ -677,7 +680,14 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + if (!level) + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + else if (!data_opts.scrub) + ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -695,7 +705,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(trans, &iter); + if (!bch2_btree_iter_advance(trans, &iter)) + break; } bch2_trans_iter_exit(trans, &reflink_iter); @@ -727,7 +738,7 @@ int __bch2_move_data(struct moving_context *ctxt, ret = bch2_move_data_btree(ctxt, id == start.btree ? start.pos : POS_MIN, id == end.btree ? 
end.pos : POS_MAX, - pred, arg, id); + pred, arg, id, 0); if (ret) break; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 9c6c229e583e..0c620a5f728d 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -122,6 +122,8 @@ int bch2_move_extent(struct moving_context *, struct bch_io_opts, struct data_update_opts); +int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, + move_pred_fn, void *, enum btree_id, unsigned); int __bch2_move_data(struct moving_context *, struct bbpos, struct bbpos, From fe27298b92001d51797ddc26ca0d7c3d4a0f04d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 21:35:28 -0400 Subject: [PATCH 066/218] bcachefs: bch2_move_data_btree() can now walk roots Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 47 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a8ad8d4538e0..ff56d8886c32 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -606,7 +606,52 @@ int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } +retry_root: bch2_trans_begin(trans); + + if (level == bch2_btree_id_root(c, btree_id)->level + 1) { + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto root_err; + + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + k = bkey_i_to_s_c(&b->key); + + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + iter.pos, &iter, k); + ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + goto root_err; + + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) + goto out; + + + if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, 
btree_id, level, + k.k->p, data_opts.target, 0); + else + ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + +root_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + goto out; + } + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, BTREE_ITER_prefetch| BTREE_ITER_not_extents| @@ -708,7 +753,7 @@ int bch2_move_data_btree(struct moving_context *ctxt, if (!bch2_btree_iter_advance(trans, &iter)) break; } - +out: bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); From 9e260e4590e044dc5887f9eb21dfaf479226e7d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Apr 2024 17:40:47 -0400 Subject: [PATCH 067/218] docs: bcachefs: idle work scheduling design doc People have been asking to see the plan for this, so - bcachefs has various background tasks that need to be scheduled to balance efficiency, predictability of performance, etc. The design and philosophy hasn't changed too much since bcache, which was primarily designed for server usage, with sustained load in mind. These days we're seeing more desktop usage - where we really want to let the system idle effectively, to reduce total power usage - while also still balancing previous concerns, we still want to let work accumulate to a degree. This lays out all the requirements and starts to sketch out the algorithm I have in mind.
Signed-off-by: Kent Overstreet --- .../filesystems/bcachefs/future/idle_work.rst | 78 +++++++++++++++++++ Documentation/filesystems/bcachefs/index.rst | 7 ++ 2 files changed, 85 insertions(+) create mode 100644 Documentation/filesystems/bcachefs/future/idle_work.rst diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst new file mode 100644 index 000000000000..59a332509dcd --- /dev/null +++ b/Documentation/filesystems/bcachefs/future/idle_work.rst @@ -0,0 +1,78 @@ +Idle/background work classes design doc: + +Right now, our behaviour at idle isn't ideal, it was designed for servers that +would be under sustained load, to keep pending work at a "medium" level, to +let work build up so we can process it in more efficient batches, while also +giving headroom for bursts in load. + +But for desktops or mobile - scenarios where work is less sustained and power +usage is more important - we want to operate differently, with a "rush to +idle" so the system can go to sleep. We don't want to be dribbling out +background work while the system should be idle. + +The complicating factor is that there are a number of background tasks, which +form a hierarchy (or a digraph, depending on how you divide it up) - one +background task may generate work for another. + +Thus proper idle detection needs to model this hierarchy. + +- Foreground writes +- Page cache writeback +- Copygc, rebalance +- Journal reclaim + +When we implement idle detection and rush to idle, we need to be careful not +to disturb too much the existing behaviour that works reasonably well when the +system is under sustained load (or perhaps improve it in the case of +rebalance, which currently does not actively attempt to let work batch up).
+ +SUSTAINED LOAD REGIME +--------------------- + +When the system is under continuous load, we want these jobs to run +continuously - this is perhaps best modelled with a P/D controller, where +they'll be trying to keep a target value (i.e. fragmented disk space, +available journal space) roughly in the middle of some range. + +The goal under sustained load is to balance our ability to handle load spikes +without running out of x resource (free disk space, free space in the +journal), while also letting some work accumulate to be batched (or become +unnecessary). + +For example, we don't want to run copygc too aggressively, because then it +will be evacuating buckets that would have become empty (been overwritten or +deleted) anyways, and we don't want to wait until we're almost out of free +space because then the system will behave unpredictably - suddenly we're doing +a lot more work to service each write and the system becomes much slower. + +IDLE REGIME +----------- + +When the system becomes idle, we should start flushing our pending work +quicker so the system can go to sleep. + +Note that the definition of "idle" depends on where in the hierarchy a task +is - a task should start flushing work more quickly when the task above it has +stopped generating new work. + +e.g. rebalance should start flushing more quickly when page cache writeback is +idle, and journal reclaim should only start flushing more quickly when both +copygc and rebalance are idle. + +It's important to let work accumulate when more work is still incoming and we +still have room, because flushing is always more efficient if we let it batch +up. New writes may overwrite data before rebalance moves it, and tasks may be +generating more updates for the btree nodes that journal reclaim needs to flush. + +On idle, how much work we do at each interval should be proportional to the +length of time we have been idle for.
If we're idle only for a short duration, +we shouldn't flush everything right away; the system might wake up and start +generating new work soon, and flushing immediately might end up doing a lot of +work that would have been unnecessary if we'd allowed things to batch more. + +To summarize, we will need: + + - A list of classes for background tasks that generate work, which will + include one "foreground" class. + - Tracking for each class - "Am I doing work, or have I gone to sleep?" + - And each class should check the class above it when deciding how much work to issue. diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst index 3864d0ae89c1..e5c4c2120b93 100644 --- a/Documentation/filesystems/bcachefs/index.rst +++ b/Documentation/filesystems/bcachefs/index.rst @@ -29,3 +29,10 @@ At this moment, only a few of these are described here. casefolding errorcodes + +Future design +------------- +.. toctree:: + :maxdepth: 1 + + future/idle_work From 62095464e9d2a2340a6b08a90fb280ea2b091a28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Apr 2025 12:42:13 -0400 Subject: [PATCH 068/218] bcachefs: Fix struct with flex member ABI warning This pops up when building in userspace. The structs aren't actually variable length, but no way to tell the compiler that...
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 24 ++++++++++++------------ fs/bcachefs/disk_accounting.h | 8 ++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 1f0422bfae35..e399237e124a 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -631,17 +631,17 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) } static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos acc, + struct disk_accounting_pos *acc, u64 *v, unsigned nr) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0, invalid_dev = -1; - switch (acc.type) { + switch (acc->type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, &acc); + __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && @@ -660,7 +660,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { /* * We're not RW yet and still single threaded, dropping @@ -676,8 +676,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, } case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { - invalid_dev = acc.dev_data_type.dev; + if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { + invalid_dev = acc->dev_data_type.dev; goto invalid_device; } break; @@ -691,13 +691,13 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, "accounting entry points to invalid device %i\n%s", invalid_dev, (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), 
buf.buf))) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { ret = -BCH_ERR_remove_disk_accounting_entry; @@ -748,7 +748,7 @@ int bch2_accounting_read(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { + if (!bch2_accounting_is_mem(&acc_k)) { struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; @@ -770,7 +770,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) continue; struct bkey_s_c k = bkey_i_to_s_c(i->k); @@ -826,7 +826,7 @@ int bch2_accounting_read(struct bch_fs *c) */ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); + : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { free_percpu(i->v[0]); @@ -939,7 +939,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { + if (!bch2_accounting_is_mem(&acc_k)) { struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index d557b99b3c0a..54cb8a5b117d 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -139,10 +139,10 @@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); 
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) { - return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc.type != BCH_DISK_ACCOUNTING_inum; + return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc->type != BCH_DISK_ACCOUNTING_inum; } /* @@ -163,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, if (gc && !acc->gc_running) return 0; - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { From 09279bba72f809eeb1f02d39a462e8e1d06fa32a Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 18 Apr 2025 15:52:10 +0800 Subject: [PATCH 069/218] bcachefs: Kill dead code Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index a02f483a016a..fd7a140c9fd6 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -637,8 +637,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - nr_buckets = min(nr_buckets, ja->nr); - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); From 834f9475aabd84f60760ac8ceffc45eedff4a176 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 14 Mar 2025 09:46:25 -0400 Subject: [PATCH 070/218] bcachefs: bch2_check_rebalance_work() Add a pass for checking the rebalance_work btree. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 116 ++++++++++++++++++++++++++++ fs/bcachefs/rebalance.h | 2 + fs/bcachefs/recovery_passes_types.h | 1 + 3 files changed, 119 insertions(+) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d2a7001cf872..26c87ab019e8 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -712,3 +712,119 @@ void bch2_fs_rebalance_init(struct bch_fs *c) { bch2_pd_controller_init(&c->rebalance.pd); } + +static int check_rebalance_work_one(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct btree_iter *rebalance_iter, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c extent_k, rebalance_k; + struct printbuf buf = PRINTBUF; + + int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: + bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); + if (ret) + return ret; + + if (!extent_k.k && + extent_iter->btree_id == BTREE_ID_reflink && + (!rebalance_k.k || + rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); + return -BCH_ERR_transaction_restart_nested; + } + + if (!extent_k.k && !rebalance_k.k) + return 1; + + int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, + rebalance_k.k ? 
rebalance_k.k->p : SPOS_MAX); + + struct bkey deleted; + bkey_init(&deleted); + + if (cmp < 0) { + deleted.p = extent_k.k->p; + rebalance_k.k = &deleted; + } else if (cmp > 0) { + deleted.p = rebalance_k.k->p; + extent_k.k = &deleted; + } + + bool should_have_rebalance = + bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; + + if (should_have_rebalance != have_rebalance) { + ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); + if (ret) + return ret; + + bch2_bkey_val_to_text(&buf, c, extent_k); + } + + if (fsck_err_on(!should_have_rebalance && have_rebalance, + trans, rebalance_work_incorrectly_set, + "rebalance work incorrectly set\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, false); + if (ret) + goto err; + } + + if (fsck_err_on(should_have_rebalance && !have_rebalance, + trans, rebalance_work_incorrectly_unset, + "rebalance work incorrectly unset\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, true); + if (ret) + goto err; + } + + if (cmp <= 0) + bch2_btree_iter_advance(trans, extent_iter); + if (cmp >= 0) + bch2_btree_iter_advance(trans, rebalance_iter); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_rebalance_work(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter rebalance_iter, extent_iter; + int ret = 0; + + bch2_trans_iter_init(trans, &extent_iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch); + bch2_trans_iter_init(trans, &rebalance_iter, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_prefetch); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + while (!ret) { + bch2_trans_begin(trans); + + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + + if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) + ret = 0; + } + + bch2_bkey_buf_exit(&last_flushed, c); + bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(trans, &rebalance_iter); + bch2_trans_put(trans); + return ret < 0 ? ret : 0; +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index e5e8eb4a2dd1..b7c8c0652ad6 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -54,4 +54,6 @@ void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); void bch2_fs_rebalance_init(struct bch_fs *); +int bch2_check_rebalance_work(struct bch_fs *); + #endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index f9d565bb50dd..be3185fc6ef4 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -59,6 +59,7 @@ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ + x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ x(delete_dead_inodes, 32, PASS_ALWAYS) \ x(fix_reflink_p, 33, 0) \ From c53be0ffaa501d25f58ac2e56b7e5710f3408a50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Apr 2025 00:57:55 -0400 Subject: [PATCH 071/218] bcachefs: bch2_target_to_text() no longer depends on io_ref Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 2ca3cbf12b71..4e2f237338c2 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -554,14 +554,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ? 
rcu_dereference(c->devs[t.dev]) : NULL; - if (ca && percpu_ref_tryget(&ca->io_ref[READ])) { + if (ca && ca->disk_sb.bdev) prt_printf(out, "/dev/%s", ca->name); - percpu_ref_put(&ca->io_ref[READ]); - } else if (ca) { + else if (ca) prt_printf(out, "offline device %u", t.dev); - } else { + else prt_printf(out, "invalid device %u", t.dev); - } rcu_read_unlock(); out->atomic--; From 2483dd1243584432c8b407f09673500c00095cd3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Apr 2025 11:23:53 -0400 Subject: [PATCH 072/218] bcachefs: recalc_capacity() no longer depends on io_ref Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6ac8bd49c629..0494d188605f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2505,15 +2505,15 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); - for_each_online_member(c, ca) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; + rcu_read_lock(); + for_each_member_device_rcu(c, ca, NULL) { + struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); + if (bdev) + ra_pages += bdev->bd_disk->bdi->ra_pages; - ra_pages += bdi->ra_pages; - } + if (ca->mi.state != BCH_MEMBER_STATE_rw) + continue; - bch2_set_ra_pages(c, ra_pages); - - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { u64 dev_reserve = 0; /* @@ -2550,6 +2550,9 @@ void bch2_recalc_capacity(struct bch_fs *c) bucket_size_max = max_t(unsigned, bucket_size_max, ca->mi.bucket_size); } + rcu_read_unlock(); + + bch2_set_ra_pages(c, ra_pages); gc_reserve = c->opts.gc_reserve_bytes ? 
c->opts.gc_reserve_bytes >> 9 From 9fa4a8a3bdb14c1a36ff439643e00ca416e04b66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 22:11:15 -0400 Subject: [PATCH 073/218] bcachefs: for_each_online_member_rcu() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 10 +++++++++- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/chardev.c | 6 ++++-- fs/bcachefs/fs.c | 9 ++++++--- fs/bcachefs/sb-members.h | 14 +++----------- fs/bcachefs/super.c | 16 ++++++++++++---- 6 files changed, 35 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index d56cee7e8cb5..e87b95f609c5 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1645,7 +1645,12 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) printbuf_indent_sub(&buf, 2); prt_newline(&buf); - for_each_online_member(c, ca) { + bch2_printbuf_make_room(&buf, 4096); + + rcu_read_lock(); + buf.atomic++; + + for_each_online_member_rcu(c, ca) { prt_printf(&buf, "Dev %u:\n", ca->dev_idx); printbuf_indent_add(&buf, 2); bch2_dev_alloc_debug_to_text(&buf, ca); @@ -1653,6 +1658,9 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) prt_newline(&buf); } + --buf.atomic; + rcu_read_unlock(); + prt_printf(&buf, "Copygc debug:\n"); printbuf_indent_add(&buf, 2); bch2_copygc_wait_to_text(&buf, c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 4fd096349790..1597259b708c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -891,6 +891,7 @@ struct bch_fs { struct workqueue_struct *write_ref_wq; /* ALLOCATION */ + struct bch_devs_mask online_devs; struct bch_devs_mask rw_devs[BCH_DATA_NR]; unsigned long rw_devs_change_count; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 5891b3a1e61c..4066946b26bc 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -613,11 +613,13 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return 
-EINVAL; - for_each_online_member(c, ca) + rcu_read_lock(); + for_each_online_member_rcu(c, ca) if (ca->dev == dev) { - percpu_ref_put(&ca->io_ref[READ]); + rcu_read_unlock(); return ca->dev_idx; } + rcu_read_unlock(); return -BCH_ERR_ENOENT_dev_idx_not_found; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 17a27d6d8c9d..cdf84180829a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2327,12 +2327,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) struct bch_fs *c = root->d_sb->s_fs_info; bool first = true; - for_each_online_member(c, ca) { + rcu_read_lock(); + for_each_online_member_rcu(c, ca) { if (!first) seq_putc(seq, ':'); first = false; seq_puts(seq, ca->disk_sb.sb_name); } + rcu_read_unlock(); return 0; } @@ -2529,15 +2531,16 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(c, ca) { + rcu_read_lock(); + for_each_online_member_rcu(c, ca) { struct block_device *bdev = ca->disk_sb.bdev; /* XXX: create an anonymous device for multi device filesystems */ sb->s_bdev = bdev; sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref[READ]); break; } + rcu_read_unlock(); c->dev = sb->s_dev; diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 424143f5e330..28c6fc25c32c 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -104,6 +104,9 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) +#define for_each_online_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) + static inline void bch2_dev_get(struct bch_dev *ca) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -307,17 +310,6 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, return NULL; } -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct 
bch_devs_mask devs; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(c, ca) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 839b1582c1f1..834c68a273b4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1105,7 +1105,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) break; } - return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); + return bch2_have_enough_devs(c, c->online_devs, flags, true); } int bch2_fs_start(struct bch_fs *c) @@ -1138,8 +1138,11 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - for_each_online_member(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(now); + rcu_read_unlock(); /* * Dno't write superblock yet: recovery might have to downgrade @@ -1294,6 +1297,9 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) { + if (rw == READ) + clear_bit(ca->dev_idx, ca->fs->online_devs.d); + if (!percpu_ref_is_zero(&ca->io_ref[rw])) { reinit_completion(&ca->io_ref_completion[rw]); percpu_ref_kill(&ca->io_ref[rw]); @@ -1577,6 +1583,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; + set_bit(ca->dev_idx, c->online_devs.d); + bch2_dev_sysfs_online(c, ca); struct printbuf name = PRINTBUF; @@ -1634,7 +1642,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to read from? 
*/ - new_online_devs = bch2_online_devs(c); + new_online_devs = c->online_devs; __clear_bit(ca->dev_idx, new_online_devs.d); return bch2_have_enough_devs(c, new_online_devs, flags, false); From e14e06e91dadcd1c65f08ba5a02716d3e855fc74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Apr 2025 11:27:18 -0400 Subject: [PATCH 074/218] bcachefs: __bch2_fs_read_write() no longer depends on io_ref Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 834c68a273b4..cdcfed4dd283 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -471,10 +471,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_clean_shutdown, &c->flags); - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { - bch2_dev_allocator_add(c, ca); - percpu_ref_reinit(&ca->io_ref[WRITE]); - } + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + bch2_dev_allocator_add(c, ca); + percpu_ref_reinit(&ca->io_ref[WRITE]); + } + rcu_read_unlock(); + bch2_recalc_capacity(c); /* @@ -1149,8 +1153,11 @@ int bch2_fs_start(struct bch_fs *c) */ mutex_unlock(&c->sb_lock); - for_each_rw_member(c, ca) - bch2_dev_allocator_add(c, ca); + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) + bch2_dev_allocator_add(c, ca); + rcu_read_unlock(); bch2_recalc_capacity(c); up_write(&c->state_lock); From 6d67de1079993e09e7a867a6936a8163ece98792 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Apr 2025 19:13:40 -0400 Subject: [PATCH 075/218] bcachefs: for_each_rw_member_rcu() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 ++++++++-- fs/bcachefs/journal.c | 4 +++- fs/bcachefs/journal_io.c | 5 +---- fs/bcachefs/journal_reclaim.c | 4 +++- fs/bcachefs/movinggc.c | 4 +++- fs/bcachefs/sb-members.h | 3 +++ 6 files changed, 
21 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0494d188605f..195d20220b7d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2575,8 +2575,10 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; - for_each_rw_member(c, ca) + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + rcu_read_unlock(); return ret; } @@ -2600,8 +2602,12 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) { + /* BCH_DATA_free == all rw devs */ + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (rw && (ca->mi.data_allowed & BIT(i))) + if (rw && + (i == BCH_DATA_free || + (ca->mi.data_allowed & BIT(i)))) set_bit(ca->dev_idx, c->rw_devs[i].d); else clear_bit(ca->dev_idx, c->rw_devs[i].d); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3694b83af8cc..e1cd6e8e37cf 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -699,8 +699,10 @@ static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; - for_each_rw_member(c, ca) + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + rcu_read_unlock(); return nsecs_to_jiffies(nsecs); } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 28fa381cd589..438ad32ba242 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2055,12 +2055,9 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - unsigned nr_rw_members = 0; + unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); int ret; - for_each_rw_member(c, ca) - nr_rw_members++; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 
BUG_ON(!w->write_started); BUG_ON(w->write_allocated); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index fd7a140c9fd6..dc8169a970dd 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -627,7 +627,8 @@ static u64 journal_seq_to_flush(struct journal *j) spin_lock(&j->lock); - for_each_rw_member(c, ca) { + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -641,6 +642,7 @@ static u64 journal_seq_to_flush(struct journal *j) seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } + rcu_read_unlock(); /* Also flush if the pin fifo is more than half full */ seq_to_flush = max_t(s64, seq_to_flush, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 96873372b516..e97e87ebe312 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -279,7 +279,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { s64 wait = S64_MAX, fragmented_allowed, fragmented; - for_each_rw_member(c, ca) { + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) { struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); struct bch_dev_usage usage; @@ -296,6 +297,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) wait = min(wait, max(0LL, fragmented_allowed - fragmented)); } + rcu_read_unlock(); return wait; } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 28c6fc25c32c..c71a1ba61525 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -107,6 +107,9 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * #define for_each_online_member_rcu(_c, _ca) \ for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) +#define for_each_rw_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) + static inline void bch2_dev_get(struct bch_dev *ca) { #ifdef CONFIG_BCACHEFS_DEBUG From 
f5241e41272858b983da45ac7f8a6ad58c4ba71f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 14:56:09 -0400 Subject: [PATCH 076/218] bcachefs: enumerated_ref.c Factor out the debug code for rw filesystem refs into a small library. In release mode an enumerated ref is a normal percpu refcount, but in debug mode all enumerated users of the ref get their own atomic_long_t ref - making it much easier to chase down refcount usage bugs for when a refcount has many users. For debugging, we have enumerated_ref_to_text(), which prints the current value of each different user. Additionally, in debug mode enumerated_ref_stop() has a 10 second timeout, after which it will dump outstanding refcounts. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/enumerated_ref.c | 144 +++++++++++++++++++++++++++++ fs/bcachefs/enumerated_ref.h | 66 +++++++++++++ fs/bcachefs/enumerated_ref_types.h | 19 ++++ 4 files changed, 230 insertions(+) create mode 100644 fs/bcachefs/enumerated_ref.c create mode 100644 fs/bcachefs/enumerated_ref.h create mode 100644 fs/bcachefs/enumerated_ref_types.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 9af65079374f..d2b8aec6ed8c 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -35,6 +35,7 @@ bcachefs-y := \ disk_accounting.o \ disk_groups.o \ ec.o \ + enumerated_ref.o \ errcode.o \ error.o \ extents.o \ diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c new file mode 100644 index 000000000000..56ab430f209f --- /dev/null +++ b/fs/bcachefs/enumerated_ref.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "enumerated_ref.h" +#include "util.h" + +#include + +#ifdef ENUMERATED_REF_DEBUG +void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + atomic_long_inc(&ref->refs[idx]); +} + +bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return 
atomic_long_inc_not_zero(&ref->refs[idx]); +} + +bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return !ref->dying && + atomic_long_inc_not_zero(&ref->refs[idx]); +} + +void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + long v = atomic_long_dec_return(&ref->refs[idx]); + + BUG_ON(v < 0); + if (v) + return; + + for (unsigned i = 0; i < ref->nr; i++) + if (atomic_long_read(&ref->refs[i])) + return; + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +#ifndef ENUMERATED_REF_DEBUG +static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref) +{ + struct enumerated_ref *ref = + container_of(percpu_ref, struct enumerated_ref, ref); + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +void enumerated_ref_stop_async(struct enumerated_ref *ref) +{ + reinit_completion(&ref->stop_complete); + +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_kill(&ref->ref); +#else + ref->dying = true; + for (unsigned i = 0; i < ref->nr; i++) + enumerated_ref_put(ref, i); +#endif +} + +void enumerated_ref_stop(struct enumerated_ref *ref, + const char * const names[]) +{ + enumerated_ref_stop_async(ref); + while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n"); + prt_str(&buf, "Outstanding refs:\n"); + enumerated_ref_to_text(&buf, ref, names); + printk(KERN_ERR "%s", buf.buf); + printbuf_exit(&buf); + } +} + +void enumerated_ref_start(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_reinit(&ref->ref); +#else + ref->dying = false; + for (unsigned i = 0; i < ref->nr; i++) { + BUG_ON(atomic_long_read(&ref->refs[i])); + atomic_long_inc(&ref->refs[i]); + } +#endif +} + +void enumerated_ref_exit(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_exit(&ref->ref); 
+#else + kfree(ref->refs); + ref->refs = NULL; + ref->nr = 0; +#endif +} + +int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr, + void (*stop_fn)(struct enumerated_ref *)) +{ + init_completion(&ref->stop_complete); + ref->stop_fn = stop_fn; + +#ifndef ENUMERATED_REF_DEBUG + return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb, + PERCPU_REF_INIT_DEAD, GFP_KERNEL); +#else + ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL); + if (!ref->refs) + return -ENOMEM; + + ref->nr = nr; + return 0; +#endif +} + +void enumerated_ref_to_text(struct printbuf *out, + struct enumerated_ref *ref, + const char * const names[]) +{ +#ifdef ENUMERATED_REF_DEBUG + bch2_printbuf_tabstop_push(out, 32); + + for (unsigned i = 0; i < ref->nr; i++) + prt_printf(out, "%s\t%li\n", names[i], + atomic_long_read(&ref->refs[i])); +#else + prt_str(out, "(not in debug mode)\n"); +#endif +} diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h new file mode 100644 index 000000000000..ec01cf59ef80 --- /dev/null +++ b/fs/bcachefs/enumerated_ref.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ENUMERATED_REF_H +#define _BCACHEFS_ENUMERATED_REF_H + +#include "enumerated_ref_types.h" + +/* + * A refcount where the users are enumerated: in debug mode, we create separate + * refcounts for each user, to make leaks and refcount errors easy to track + * down: + */ + +#ifdef ENUMERATED_REF_DEBUG +void enumerated_ref_get(struct enumerated_ref *, unsigned); +bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned); +bool enumerated_ref_tryget(struct enumerated_ref *, unsigned); +void enumerated_ref_put(struct enumerated_ref *, unsigned); +#else + +static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) +{ + percpu_ref_get(&ref->ref); +} + +static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + return percpu_ref_tryget(&ref->ref); +} + +static inline bool
enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + return percpu_ref_tryget_live(&ref->ref); +} + +static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) +{ + percpu_ref_put(&ref->ref); +} +#endif + +static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + return percpu_ref_is_zero(&ref->ref); +#else + for (unsigned i = 0; i < ref->nr; i++) + if (atomic_long_read(&ref->refs[i])) + return false; + return true; +#endif +} + +void enumerated_ref_stop_async(struct enumerated_ref *); +void enumerated_ref_stop(struct enumerated_ref *, const char * const[]); +void enumerated_ref_start(struct enumerated_ref *); + +void enumerated_ref_exit(struct enumerated_ref *); +int enumerated_ref_init(struct enumerated_ref *, unsigned, + void (*stop_fn)(struct enumerated_ref *)); + +struct printbuf; +void enumerated_ref_to_text(struct printbuf *, + struct enumerated_ref *, + const char * const[]); + +#endif /* _BCACHEFS_ENUMERATED_REF_H */ diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h new file mode 100644 index 000000000000..0e6076f466d3 --- /dev/null +++ b/fs/bcachefs/enumerated_ref_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H +#define _BCACHEFS_ENUMERATED_REF_TYPES_H + +#include <linux/percpu-refcount.h> + +struct enumerated_ref { +#ifdef ENUMERATED_REF_DEBUG + unsigned nr; + bool dying; + atomic_long_t *refs; +#else + struct percpu_ref ref; +#endif + void (*stop_fn)(struct enumerated_ref *); + struct completion stop_complete; +}; + +#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */ From c9b1d94a2196fcfd1985ff8728b22abda400b1ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 14:56:09 -0400 Subject: [PATCH 077/218] bcachefs: bch_fs.writes -> enumerated_refs Drop the single-purpose write ref code in bcachefs.h, and convert to enumerated refs.
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 23 ++++++------ fs/bcachefs/bcachefs.h | 57 ++--------------------------- fs/bcachefs/btree_gc.c | 7 ++-- fs/bcachefs/btree_io.c | 7 ++-- fs/bcachefs/btree_trans_commit.c | 5 ++- fs/bcachefs/btree_update_interior.c | 7 ++-- fs/bcachefs/btree_write_buffer.c | 11 +++--- fs/bcachefs/ec.c | 13 ++++--- fs/bcachefs/fs-io-direct.c | 7 ++-- fs/bcachefs/fs-io.c | 9 +++-- fs/bcachefs/io_read.c | 7 ++-- fs/bcachefs/io_write.c | 5 ++- fs/bcachefs/journal.c | 5 ++- fs/bcachefs/reflink.c | 5 ++- fs/bcachefs/snapshot.c | 7 ++-- fs/bcachefs/subvolume.c | 7 ++-- fs/bcachefs/super.c | 31 ++++------------ fs/bcachefs/sysfs.c | 26 +++---------- 18 files changed, 86 insertions(+), 153 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 195d20220b7d..ced31309c541 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -17,6 +17,7 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -1381,7 +1382,7 @@ static void check_discard_freespace_key_work(struct work_struct *work) container_of(work, struct check_discard_freespace_key_async, work); bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); - bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); kfree(w); } @@ -1458,7 +1459,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite if (!w) goto out; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { kfree(w); goto out; } @@ -1953,14 +1954,14 @@ static void bch2_do_discards_work(struct work_struct *work) bch2_err_str(ret)); percpu_ref_put(&ca->io_ref[WRITE]); - bch2_write_ref_put(c, 
BCH_WRITE_REF_discard); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) return; if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) @@ -1971,7 +1972,7 @@ void bch2_dev_do_discards(struct bch_dev *ca) percpu_ref_put(&ca->io_ref[WRITE]); put_write_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -2048,7 +2049,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) bch2_trans_put(trans); percpu_ref_put(&ca->io_ref[WRITE]); - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) @@ -2058,7 +2059,7 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (discard_in_flight_add(ca, bucket, false)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) return; if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) @@ -2069,7 +2070,7 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) percpu_ref_put(&ca->io_ref[WRITE]); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bp(struct btree_trans *trans, @@ -2263,14 +2264,14 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_put(trans); percpu_ref_put(&ca->io_ref[WRITE]); bch2_bkey_buf_exit(&last_flushed, c); - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, 
BCH_WRITE_REF_invalidate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) return; if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) @@ -2281,7 +2282,7 @@ void bch2_dev_do_invalidates(struct bch_dev *ca) percpu_ref_put(&ca->io_ref[WRITE]); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1597259b708c..ced80d4b606a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -219,7 +219,7 @@ #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG -#define BCH_WRITE_REF_DEBUG +#define ENUMERATED_REF_DEBUG #endif #ifndef dynamic_fault @@ -483,6 +483,7 @@ enum bch_time_stats { #include "clock_types.h" #include "disk_groups_types.h" #include "ec_types.h" +#include "enumerated_ref_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" @@ -733,11 +734,7 @@ struct bch_fs { struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_t writes[BCH_WRITE_REF_NR]; -#else - struct percpu_ref writes; -#endif + struct enumerated_ref writes; /* * Certain operations are only allowed in single threaded mode, during * recovery, and we want to assert that this is the case: @@ -1115,54 +1112,6 @@ struct bch_fs { extern struct wait_queue_head bch2_read_only_wait; -static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_inc(&c->writes[ref]); -#else - percpu_ref_get(&c->writes); -#endif -} - -static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget(&c->writes); -#endif -} - -static inline bool bch2_write_ref_tryget(struct bch_fs 
*c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget_live(&c->writes); -#endif -} - -static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - long v = atomic_long_dec_return(&c->writes[ref]); - - BUG_ON(v < 0); - if (v) - return; - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) - if (atomic_long_read(&c->writes[i])) - return; - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); -#else - percpu_ref_put(&c->writes); -#endif -} - static inline bool bch2_ro_ref_tryget(struct bch_fs *c) { if (test_bit(BCH_FS_stopping, &c->flags)) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1f02d28c175c..2e72784332ff 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -22,6 +22,7 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -1256,14 +1257,14 @@ static void bch2_gc_gens_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); bch2_gc_gens(c); - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } void bch2_gc_gens_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && !queue_work(c->write_ref_wq, &c->gc_gens_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } void bch2_fs_btree_gc_init_early(struct bch_fs *c) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c1c671e340c7..9e759d9e29b1 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -13,6 +13,7 @@ #include "buckets.h" #include "checksum.h" #include "debug.h" +#include 
"enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -1931,7 +1932,7 @@ static void btree_node_scrub_work(struct work_struct *work) btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); percpu_ref_put(&scrub->ca->io_ref[READ]); kfree(scrub); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); } static void btree_node_scrub_endio(struct bio *bio) @@ -1950,7 +1951,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) return -BCH_ERR_erofs_no_writes; struct extent_ptr_decoded pick; @@ -2000,7 +2001,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); percpu_ref_put(&ca->io_ref[READ]); err: - bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); return ret; } diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 4297d8b5eddd..cdde769e7da3 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -11,6 +11,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -994,7 +995,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) ret = do_bch2_trans_commit_to_journal_replay(trans); else @@ -1060,7 +1061,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trace_and_count(c, 
transaction_commit, trans, _RET_IP_); out: if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); out_reset: if (!ret) bch2_trans_downgrade(trans); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3155b4360fbc..3d25c2be035e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -14,6 +14,7 @@ #include "btree_locking.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -2341,7 +2342,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) closure_wake_up(&c->btree_node_rewrites_wait); bch2_bkey_buf_exit(&a->key, c); - bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2363,7 +2364,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) spin_lock(&c->btree_node_rewrites_lock); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && - bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { @@ -2402,7 +2403,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) if (!a) break; - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } } diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 68ab48af40f0..0094e4342b69 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,6 +7,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "error.h" #include 
"extents.h" #include "journal.h" @@ -629,11 +630,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) return -BCH_ERR_erofs_no_writes; int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); return ret; } @@ -692,7 +693,7 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) } while (!ret && bch2_btree_write_buffer_should_flush(c)); mutex_unlock(&wb->flushing.lock); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); } static void wb_accounting_sort(struct btree_write_buffer *wb) @@ -821,9 +822,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ bch2_journal_pin_drop(&c->journal, &dst->wb->pin); if (bch2_btree_write_buffer_should_flush(c) && - __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 37e63137041c..94c24f4582bd 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -16,6 +16,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_write.h" @@ -1011,14 +1012,14 @@ static void ec_stripe_delete_work(struct work_struct *work) BCH_TRANS_COMMIT_no_enospc, ({ ec_stripe_delete(trans, lru_k.k->p.offset); }))); - 
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } void bch2_do_stripe_deletes(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } /* stripe creation: */ @@ -1412,15 +1413,15 @@ static void ec_stripe_create_work(struct work_struct *work) while ((s = get_pending_stripe(c))) ec_stripe_create(s); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } void bch2_ec_do_stripe_creates(struct bch_fs *c) { - bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 535bc5fcbcc0..1f5154d9676b 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "enumerated_ref.h" #include "fs.h" #include "fs-io.h" #include "fs-io-direct.h" @@ -401,7 +402,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); - bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -606,7 +607,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) 
prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) return -EROFS; inode_lock(&inode->v); @@ -675,7 +676,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) bio_put(bio); inode_dio_end(&inode->v); err_put_write_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); goto out; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7200ec00128d..6ea2762e6517 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -7,6 +7,7 @@ #include "btree_update.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -219,7 +220,7 @@ static int bch2_flush_inode(struct bch_fs *c, if (c->opts.journal_flush_disabled) return 0; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) return -EROFS; u64 seq; @@ -227,7 +228,7 @@ static int bch2_flush_inode(struct bch_fs *c, bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); return ret; } @@ -818,7 +819,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); @@ -842,7 +843,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); - bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); return 
bch2_err_class(ret); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e490f136d63d..baedfee67399 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -17,6 +17,7 @@ #include "data_update.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_misc.h" @@ -178,7 +179,7 @@ static noinline void promote_free(struct bch_read_bio *rbio) bch2_data_update_exit(&op->write); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } @@ -243,7 +244,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, return NULL; } - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); @@ -288,7 +289,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); err_put: - bch2_write_ref_put(c, BCH_WRITE_REF_promote); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); return ERR_PTR(ret); } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 38086c1a8e28..e95a535ad44a 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -15,6 +15,7 @@ #include "compress.h" #include "debug.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extent_update.h" #include "inode.h" @@ -531,7 +532,7 @@ static void bch2_write_done(struct closure *cl) bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_move)) - bch2_write_ref_put(c, BCH_WRITE_REF_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); EBUG_ON(cl->parent); @@ -1679,7 +1680,7 @@ CLOSURE_CALLBACK(bch2_write) } if (!(op->flags & BCH_WRITE_move) && - 
!bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index e1cd6e8e37cf..e2c95192a577 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -12,6 +12,7 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "enumerated_ref.h" #include "error.h" #include "journal.h" #include "journal_io.h" @@ -989,11 +990,11 @@ int bch2_journal_meta(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) return -BCH_ERR_erofs_no_writes; int ret = __bch2_journal_meta(j); - bch2_write_ref_put(c, BCH_WRITE_REF_journal); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); return ret; } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 710178e3da4c..3a13dbcab6ba 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -3,6 +3,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "inode.h" @@ -610,7 +611,7 @@ s64 bch2_remap_range(struct bch_fs *c, !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) return -BCH_ERR_erofs_no_writes; bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -761,7 +762,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); - bch2_write_ref_put(c, BCH_WRITE_REF_reflink); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); return dst_done ?: ret ?: ret2; } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 2eede851572c..14ea09ccee37 
100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -6,6 +6,7 @@ #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -1661,18 +1662,18 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); bch2_delete_dead_snapshots(c); - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) return; BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index c9d7209f0cb1..ff20ce98a476 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -517,7 +518,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor darray_exit(&s); } - bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); } struct subvolume_unlink_hook { @@ -540,11 +541,11 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans if (ret) return ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) 
return -EROFS; if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index cdcfed4dd283..6fa427d5cbd6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -28,6 +28,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -311,15 +312,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) } } -#ifndef BCH_WRITE_REF_DEBUG -static void bch2_writes_disabled(struct percpu_ref *writes) +static void bch2_writes_disabled(struct enumerated_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } -#endif void bch2_fs_read_only(struct bch_fs *c) { @@ -337,12 +336,7 @@ void bch2_fs_read_only(struct bch_fs *c) * writes will return -EROFS: */ set_bit(BCH_FS_going_ro, &c->flags); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_kill(&c->writes); -#else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) - bch2_write_ref_put(c, i); -#endif + enumerated_ref_stop_async(&c->writes); /* * If we're not doing an emergency shutdown, we want to wait on @@ -504,14 +498,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_reinit(&c->writes); -#else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { - BUG_ON(atomic_long_read(&c->writes[i])); - atomic_long_inc(&c->writes[i]); - } -#endif + enumerated_ref_start(&c->writes); ret = bch2_copygc_start(c); if (ret) { @@ -619,9 +606,7 @@ static void __bch2_fs_free(struct bch_fs *c) mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); -#ifndef 
BCH_WRITE_REF_DEBUG - percpu_ref_exit(&c->writes); -#endif + enumerated_ref_exit(&c->writes); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); @@ -949,10 +934,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_init(&c->writes, bch2_writes_disabled, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -#endif + enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, + bch2_writes_disabled) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index bfdadeae970e..b80c46af13d4 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -25,6 +25,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "inode.h" #include "journal.h" #include "journal_reclaim.h" @@ -175,8 +176,6 @@ read_attribute(btree_reserve_cache); read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); - -#ifdef BCH_WRITE_REF_DEBUG read_attribute(write_refs); static const char * const bch2_write_refs[] = { @@ -186,15 +185,6 @@ static const char * const bch2_write_refs[] = { NULL }; -static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_printbuf_tabstop_push(out, 24); - - for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) - prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); -} -#endif - read_attribute(internal_uuid); read_attribute(disk_groups); @@ -369,10 +359,8 @@ SHOW(bch2_fs) if (attr == &sysfs_moving_ctxts) bch2_fs_moving_ctxts_to_text(out, c); -#ifdef BCH_WRITE_REF_DEBUG if (attr == &sysfs_write_refs) - bch2_write_refs_to_text(out, c); -#endif + enumerated_ref_to_text(out, 
&c->writes, bch2_write_refs); if (attr == &sysfs_nocow_lock_table) bch2_nocow_locks_to_text(out, &c->nocow_locks); @@ -405,7 +393,7 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_btree_updates) queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) return -EROFS; if (attr == &sysfs_trigger_btree_cache_shrink) { @@ -465,7 +453,7 @@ STORE(bch2_fs) size = ret; } #endif - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -558,9 +546,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, &sysfs_open_buckets, &sysfs_open_buckets_partial, -#ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, -#endif &sysfs_nocow_lock_table, &sysfs_io_timers_read, &sysfs_io_timers_write, @@ -626,7 +612,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) + if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) return -EROFS; char *tmp = kstrdup(buf, GFP_KERNEL); @@ -653,7 +639,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, ret = size; err: - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return ret; } From cca2c0d224c17c99fb2ee7674284f89ce8389f3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 21:54:12 -0400 Subject: [PATCH 078/218] bcachefs: bch_dev.io_ref -> enumerated_ref Convert device IO refs to enumerated_refs, for easier debugging of refcount issues. Simple conversion: enumerate all users and convert to the new helpers. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 18 +++---- fs/bcachefs/backpointers.c | 6 ++- fs/bcachefs/bcachefs.h | 48 +++++++++++++++++- fs/bcachefs/btree_io.c | 22 ++++---- fs/bcachefs/btree_node_scan.c | 10 ++-- fs/bcachefs/buckets.c | 4 +- fs/bcachefs/debug.c | 12 +++-- fs/bcachefs/ec.c | 19 ++++--- fs/bcachefs/fs-io.c | 6 ++- fs/bcachefs/io_read.c | 10 ++-- fs/bcachefs/io_write.c | 15 ++++-- fs/bcachefs/journal.c | 5 +- fs/bcachefs/journal_io.c | 15 +++--- fs/bcachefs/journal_reclaim.c | 2 +- fs/bcachefs/sb-members.h | 32 ++++++------ fs/bcachefs/super-io.c | 18 +++---- fs/bcachefs/super.c | 93 ++++++++++++++++++---------------- fs/bcachefs/super.h | 3 ++ fs/bcachefs/sysfs.c | 18 ++++--- 19 files changed, 222 insertions(+), 134 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ced31309c541..c63348c4b874 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1953,7 +1953,7 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } @@ -1964,13 +1964,13 @@ void bch2_dev_do_discards(struct bch_dev *ca) if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); put_write_ref: enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } @@ -2048,7 +2048,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) trace_discard_buckets_fast(c, s.seen, s.open, 
s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } @@ -2062,13 +2062,13 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); put_ref: enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } @@ -2262,8 +2262,8 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref[WRITE]); bch2_bkey_buf_exit(&last_flushed, c); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } @@ -2274,13 +2274,13 @@ void bch2_dev_do_invalidates(struct bch_dev *ca) if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); put_ref: enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5f195d2280a4..e6178eb2c396 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -478,7 +478,8 @@ static int check_extent_checksum(struct 
btree_trans *trans, bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, + BCH_DEV_READ_REF_check_extent_checksums); if (!ca) return false; @@ -515,7 +516,8 @@ static int check_extent_checksum(struct btree_trans *trans, if (bio) bio_put(bio); kvfree(data_buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_check_extent_checksums); printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ced80d4b606a..3d18dbe0d6f5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -516,6 +516,51 @@ struct discard_in_flight { u64 bucket:63; }; +#define BCH_DEV_READ_REFS() \ + x(bch2_online_devs) \ + x(trans_mark_dev_sbs) \ + x(read_fua_test) \ + x(sb_field_resize) \ + x(write_super) \ + x(journal_read) \ + x(fs_journal_alloc) \ + x(fs_resize_on_mount) \ + x(btree_node_read) \ + x(btree_node_read_all_replicas) \ + x(btree_node_scrub) \ + x(btree_node_write) \ + x(btree_node_scan) \ + x(btree_verify_replicas) \ + x(btree_node_ondisk_to_text) \ + x(io_read) \ + x(check_extent_checksums) \ + x(ec_block) + +enum bch_dev_read_ref { +#define x(n) BCH_DEV_READ_REF_##n, + BCH_DEV_READ_REFS() +#undef x + BCH_DEV_READ_REF_NR, +}; + +#define BCH_DEV_WRITE_REFS() \ + x(journal_write) \ + x(journal_do_discards) \ + x(dev_do_discards) \ + x(discard_one_bucket_fast) \ + x(do_invalidates) \ + x(nocow_flush) \ + x(io_write) \ + x(ec_block) \ + x(ec_bucket_zero) + +enum bch_dev_write_ref { +#define x(n) BCH_DEV_WRITE_REF_##n, + BCH_DEV_WRITE_REFS() +#undef x + BCH_DEV_WRITE_REF_NR, +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -526,8 +571,7 @@ struct bch_dev { struct percpu_ref ref; #endif struct completion ref_completion; - struct percpu_ref io_ref[2]; - struct completion io_ref_completion[2]; + struct enumerated_ref io_ref[2]; struct bch_fs *fs; diff --git 
a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9e759d9e29b1..8fe9e0fc6629 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1326,7 +1326,7 @@ static void btree_node_read_work(struct work_struct *work) while (1) { retry = true; bch_info(c, "retrying read"); - ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = ca != NULL; rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); @@ -1351,7 +1351,7 @@ static void btree_node_read_work(struct work_struct *work) "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = false; bch2_mark_io_failure(&failed, &rb->pick, false); @@ -1609,7 +1609,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_read_all_replicas); } ra->err[rb->idx] = bio->bi_status; @@ -1649,7 +1650,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_read_all_replicas); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; @@ -1727,7 +1729,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); bio = bio_alloc_bioset(NULL, buf_pages(b->data, 
btree_buf_bytes(b)), @@ -1930,7 +1932,7 @@ static void btree_node_scrub_work(struct work_struct *work) printbuf_exit(&err); bch2_bkey_buf_exit(&scrub->key, c);; btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); - percpu_ref_put(&scrub->ca->io_ref[READ]); + enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); kfree(scrub); enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); } @@ -1959,7 +1961,8 @@ int bch2_btree_node_scrub(struct btree_trans *trans, if (ret <= 0) goto err; - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_scrub); if (!ca) { ret = -BCH_ERR_device_offline; goto err; @@ -1999,7 +2002,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, return 0; err_free: btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); err: enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); return ret; @@ -2169,7 +2172,8 @@ static void btree_node_write_endio(struct bio *bio) * btree writes yet (due to device removal/ro): */ if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_write); if (parent) { bio_put(bio); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 81ee7ae88a77..7bd13438d5ef 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); closure_put(w->cl); kfree(w); return 0; @@ -285,13 +285,13 @@ static int read_btree_nodes(struct find_btree_nodes *f) closure_init_stack(&cl); - for_each_online_member(c, 
ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); if (!w) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); ret = -ENOMEM; goto err; } @@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f) struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); kfree(w); bch_err_msg(c, ret, "starting kthread"); break; } closure_get(&cl); - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); wake_up_process(t); } err: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 36c1e391d4df..3ec33a7e9d92 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1146,10 +1146,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, enum btree_iter_update_trigger_flags flags) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); return ret; } } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 312f5ce7cba9..4cbb19c36fa1 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -42,7 +42,8 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct bio *bio; bool failed = false, saw_error = false; - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + 
BCH_DEV_READ_REF_btree_verify_replicas); if (!ca) return false; @@ -57,7 +58,8 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_verify_replicas); memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); @@ -196,7 +198,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_ondisk_to_text); if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; @@ -297,7 +300,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, if (bio) bio_put(bio); kvfree(n_ondisk); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_ondisk_to_text); } #ifdef CONFIG_DEBUG_FS diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 94c24f4582bd..dcd4e2266d34 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -701,6 +701,9 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; int rw = ec_bio->rw; + unsigned ref = rw == READ + ? BCH_DEV_READ_REF_ec_block + : BCH_DEV_WRITE_REF_ec_block; bch2_account_io_completion(ca, bio_data_dir(bio), ec_bio->submit_time, !bio->bi_status); @@ -722,7 +725,7 @@ static void ec_block_endio(struct bio *bio) } bio_put(&ec_bio->bio); - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref); closure_put(cl); } @@ -736,8 +739,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); + unsigned ref = rw == READ + ? 
BCH_DEV_READ_REF_ec_block + : BCH_DEV_WRITE_REF_ec_block; - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); if (!ca) { clear_bit(idx, buf->valid); return; @@ -783,14 +789,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); - percpu_ref_get(&ca->io_ref[rw]); + enumerated_ref_get(&ca->io_ref[rw], ref); submit_bio(&ec_bio->bio); offset += b; } - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref); } static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, @@ -1247,7 +1253,8 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, + BCH_DEV_WRITE_REF_ec_bucket_zero); if (!ca) { s->err = -BCH_ERR_erofs_no_writes; return; @@ -1263,7 +1270,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, ob->sectors_free, GFP_KERNEL, 0); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); if (ret) s->err = ret; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 6ea2762e6517..b1e9ee28fc0f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -49,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio) struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref[WRITE]); + enumerated_ref_put(&bio->ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush); bio_put(&bio->bio); } @@ -72,7 +73,8 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { rcu_read_lock(); ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE])) + if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], + 
BCH_DEV_WRITE_REF_nocow_flush)) ca = NULL; rcu_read_unlock(); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index baedfee67399..136b6d54a2c2 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -409,7 +409,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) if (rbio->have_ioref) { struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); } if (rbio->split) { @@ -1100,7 +1100,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto err; } - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_io_read); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -1114,7 +1115,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick, false); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); goto retry_pick; } @@ -1147,7 +1148,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, */ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_io_read); rbio->ret = -BCH_ERR_data_read_buffer_too_small; goto out_read_done; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index e95a535ad44a..add141ac45b5 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -461,6 +461,10 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); struct bch_write_bio *n; + unsigned ref_rw = type == BCH_DATA_btree ? 
READ : WRITE; + unsigned ref_idx = type == BCH_DATA_btree + ? BCH_DEV_READ_REF_btree_node_write + : BCH_DEV_WRITE_REF_io_write; BUG_ON(c->opts.nochanges); @@ -472,7 +476,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, */ struct bch_dev *ca = nocow ? bch2_dev_have_ref(c, ptr->dev) - : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); + : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); @@ -747,7 +751,8 @@ static void bch2_write_endio(struct bio *bio) } if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -1344,7 +1349,8 @@ static void bch2_nocow_write(struct bch_write_op *op) /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, + BCH_DEV_WRITE_REF_io_write); if (unlikely(!ca)) goto err_get_ioref; @@ -1446,7 +1452,8 @@ static void bch2_nocow_write(struct bch_write_op *op) return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]); + enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); /* Fall back to COW path: */ goto out; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index e2c95192a577..f2963a6cca88 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1336,13 +1336,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) int bch2_fs_journal_alloc(struct bch_fs *c) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { if (ca->journal.nr) continue; int ret = bch2_dev_journal_alloc(ca, 
true); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_journal_alloc); return ret; } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 438ad32ba242..8f38e9485cd8 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1219,7 +1219,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvfree(buf.data); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); closure_return(cl); return; err: @@ -1254,7 +1254,8 @@ int bch2_journal_read(struct bch_fs *c, if ((ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro) && - percpu_ref_tryget(&ca->io_ref[READ])) + enumerated_ref_tryget(&ca->io_ref[READ], + BCH_DEV_READ_REF_journal_read)) closure_call(&ca->journal.read, bch2_journal_read_device, system_unbound_wq, @@ -1770,7 +1771,7 @@ static void journal_write_endio(struct bio *bio) } closure_put(&w->io); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); } static CLOSURE_CALLBACK(journal_write_submit) @@ -1781,7 +1782,8 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, + BCH_DEV_WRITE_REF_journal_write); if (!ca) { /* XXX: fix this */ bch_err(c, "missing device %u for journal write", ptr->dev); @@ -1844,8 +1846,9 @@ static CLOSURE_CALLBACK(journal_write_preflush) } if (w->separate_flush) { - for_each_rw_member(c, ca) { - percpu_ref_get(&ca->io_ref[WRITE]); + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { + enumerated_ref_get(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_journal_write); struct journal_device *ja = 
&ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index dc8169a970dd..bb339be54e7b 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -295,7 +295,7 @@ void bch2_journal_do_discards(struct journal *j) mutex_lock(&j->discard_lock); - for_each_rw_member(c, ca) { + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index c71a1ba61525..c9cb8f7657b0 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -4,6 +4,7 @@ #include "darray.h" #include "bkey_types.h" +#include "enumerated_ref.h" extern char * const bch2_member_error_strs[]; @@ -20,7 +21,7 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return !percpu_ref_is_zero(&ca->io_ref[READ]); + return !enumerated_ref_is_zero(&ca->io_ref[READ]); } static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); @@ -163,33 +164,33 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, unsigned state_mask, - int rw) + int rw, unsigned ref_idx) { rcu_read_lock(); if (ca) - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref[rw]))) + !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) ; rcu_read_unlock(); return ca; } -#define __for_each_online_member(_c, _ca, state_mask, rw) \ +#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));) + (_ca = bch2_get_next_online_dev(_c, _ca, 
state_mask, rw, ref_idx));) -#define for_each_online_member(c, ca) \ - __for_each_online_member(c, ca, ~0, READ) +#define for_each_online_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, ~0, READ, ref_idx) -#define for_each_rw_member(c, ca) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE) +#define for_each_rw_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) -#define for_each_readable_member(c, ca) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ) +#define for_each_readable_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { @@ -293,13 +294,14 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev return bch2_dev_tryget(c, dev_idx); } -static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, + int rw, unsigned ref_idx) { might_sleep(); rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !percpu_ref_tryget(&ca->io_ref[rw])) + if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) ca = NULL; rcu_read_unlock(); @@ -309,7 +311,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, return ca; if (ca) - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); return NULL; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 872707e5fa95..d53cbc5f9925 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -260,11 +260,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, /* XXX: we're not checking that offline device have enough space */ - for_each_online_member(c, ca) { + for_each_online_member(c, ca, 
BCH_DEV_READ_REF_sb_field_resize) { struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize); return NULL; } } @@ -967,7 +967,7 @@ static void write_super_endio(struct bio *bio) } closure_put(&ca->fs->sb_write); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); } static void read_back_super(struct bch_fs *c, struct bch_dev *ca) @@ -985,7 +985,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); closure_bio_submit(bio, &c->sb_write); } @@ -1011,7 +1011,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); closure_bio_submit(bio, &c->sb_write); } @@ -1043,13 +1043,13 @@ int bch2_write_super(struct bch_fs *c) * For now, we expect to be able to call write_super() when we're not * yet RW: */ - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { ret = darray_push(&online_devices, ca); if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); goto out; } - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); } /* Make sure we're using the new magic numbers: */ @@ -1216,7 +1216,7 @@ int bch2_write_super(struct bch_fs *c) /* Make new options visible after they're persistent: */ bch2_sb_update(c); 
darray_for_each(online_devices, ca) - percpu_ref_put(&(*ca)->io_ref[READ]); + enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); darray_exit(&online_devices); printbuf_exit(&err); return ret; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6fa427d5cbd6..bed0f8a80212 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -78,13 +78,28 @@ MODULE_DESCRIPTION("bcachefs filesystem"); typedef DARRAY(struct bch_sb_handle) bch_sb_handles; -const char * const bch2_fs_flag_strs[] = { #define x(n) #n, +const char * const bch2_fs_flag_strs[] = { BCH_FS_FLAGS() -#undef x NULL }; +const char * const bch2_write_refs[] = { + BCH_WRITE_REFS() + NULL +}; + +const char * const bch2_dev_read_refs[] = { + BCH_DEV_READ_REFS() + NULL +}; + +const char * const bch2_dev_write_refs[] = { + BCH_DEV_WRITE_REFS() + NULL +}; +#undef x + static void __bch2_print_str(struct bch_fs *c, const char *prefix, const char *str, bool nonblocking) { @@ -469,7 +484,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) for_each_online_member_rcu(c, ca) if (ca->mi.state == BCH_MEMBER_STATE_rw) { bch2_dev_allocator_add(c, ca); - percpu_ref_reinit(&ca->io_ref[WRITE]); + enumerated_ref_start(&ca->io_ref[WRITE]); } rcu_read_unlock(); @@ -645,6 +660,12 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_read_only(c); up_write(&c->state_lock); + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + if (ca) + bch2_dev_io_ref_stop(ca, READ); + } + for_each_member_device(c, ca) bch2_dev_unlink(ca); @@ -673,8 +694,6 @@ void __bch2_fs_stop(struct bch_fs *c) void bch2_fs_free(struct bch_fs *c) { - unsigned i; - mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); @@ -682,7 +701,7 @@ void bch2_fs_free(struct bch_fs *c) closure_sync(&c->cl); closure_debug_destroy(&c->cl); - for (i = 0; i < c->sb.nr_devices; i++) { + for (unsigned i = 0; i < c->sb.nr_devices; i++) { struct 
bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { @@ -1290,11 +1309,11 @@ static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) if (rw == READ) clear_bit(ca->dev_idx, ca->fs->online_devs.d); - if (!percpu_ref_is_zero(&ca->io_ref[rw])) { - reinit_completion(&ca->io_ref_completion[rw]); - percpu_ref_kill(&ca->io_ref[rw]); - wait_for_completion(&ca->io_ref_completion[rw]); - } + if (!enumerated_ref_is_zero(&ca->io_ref[rw])) + enumerated_ref_stop(&ca->io_ref[rw], + rw == READ + ? bch2_dev_read_refs + : bch2_dev_write_refs); } static void bch2_dev_release(struct kobject *kobj) @@ -1306,8 +1325,8 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); - WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); cancel_work_sync(&ca->io_error_work); @@ -1327,8 +1346,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - percpu_ref_exit(&ca->io_ref[WRITE]); - percpu_ref_exit(&ca->io_ref[READ]); + enumerated_ref_exit(&ca->io_ref[WRITE]); + enumerated_ref_exit(&ca->io_ref[READ]); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif @@ -1340,7 +1359,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); - if (percpu_ref_is_zero(&ca->io_ref[READ])) + if (enumerated_ref_is_zero(&ca->io_ref[READ])) return; __bch2_dev_read_only(c, ca); @@ -1362,20 +1381,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) } #endif -static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]); - - complete(&ca->io_ref_completion[READ]); -} - -static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref) -{ - 
struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]); - - complete(&ca->io_ref_completion[WRITE]); -} - static void bch2_dev_unlink(struct bch_dev *ca) { struct kobject *b; @@ -1437,8 +1442,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); - init_completion(&ca->io_ref_completion[READ]); - init_completion(&ca->io_ref_completion[WRITE]); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1464,10 +1467,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, bch2_dev_allocator_background_init(ca); - if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || + enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) @@ -1529,8 +1530,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) return -BCH_ERR_device_size_too_small; } - BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); - BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) @@ -1549,7 +1550,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->dev = ca->disk_sb.bdev->bd_dev; - percpu_ref_reinit(&ca->io_ref[READ]); + enumerated_ref_start(&ca->io_ref[READ]); return 0; } @@ -1662,8 +1663,8 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (percpu_ref_is_zero(&ca->io_ref[WRITE])) - 
percpu_ref_reinit(&ca->io_ref[WRITE]); + if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) + enumerated_ref_start(&ca->io_ref[WRITE]); bch2_dev_do_discards(ca); } @@ -1813,7 +1814,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) err: if (test_bit(BCH_FS_rw, &c->flags) && ca->mi.state == BCH_MEMBER_STATE_rw && - !percpu_ref_is_zero(&ca->io_ref[READ])) + !enumerated_ref_is_zero(&ca->io_ref[READ])) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; @@ -2112,7 +2113,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) int bch2_fs_resize_on_mount(struct bch_fs *c) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { u64 old_nbuckets = ca->mi.nbuckets; u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), ca->mi.bucket_size); @@ -2123,7 +2124,8 @@ int bch2_fs_resize_on_mount(struct bch_fs *c) int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); bch_err_fn(ca, ret); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); up_write(&c->state_lock); return ret; } @@ -2141,7 +2143,8 @@ int bch2_fs_resize_on_mount(struct bch_fs *c) if (ca->mi.freespace_initialized) { ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); up_write(&c->state_lock); return ret; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 50588ab20be2..a1566f2d77c3 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -9,6 +9,9 @@ #include extern const char * const bch2_fs_flag_strs[]; +extern const char * const bch2_write_refs[]; +extern const char * const bch2_dev_read_refs[]; +extern const char * const bch2_dev_write_refs[]; struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); diff --git a/fs/bcachefs/sysfs.c 
b/fs/bcachefs/sysfs.c index b80c46af13d4..dfae5eda7a4c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -176,14 +176,9 @@ read_attribute(btree_reserve_cache); read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); -read_attribute(write_refs); -static const char * const bch2_write_refs[] = { -#define x(n) #n, - BCH_WRITE_REFS() -#undef x - NULL -}; +read_attribute(read_refs); +read_attribute(write_refs); read_attribute(internal_uuid); read_attribute(disk_groups); @@ -790,6 +785,12 @@ SHOW(bch2_dev) if (opt_id >= 0) return sysfs_opt_show(c, ca, opt_id, out); + if (attr == &sysfs_read_refs) + enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); + + if (attr == &sysfs_write_refs) + enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); + return 0; } @@ -845,6 +846,9 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, + + &sysfs_read_refs, + &sysfs_write_refs, NULL }; From 5f0de475f967a094bd596913ffbe9ad9b33b4e3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Apr 2025 12:04:10 -0400 Subject: [PATCH 079/218] bcachefs: bch2_bio_to_text() Pretty printer for struct bio, to be used for async object debugging. This is pretty minimal, we'll add more to it as we discover what we need. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 10 ++++++++++ fs/bcachefs/util.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 6e5d7fc265bd..7e6ebe8cd9ea 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -715,6 +715,16 @@ void bch2_corrupt_bio(struct bio *bio) } #endif +void bch2_bio_to_text(struct printbuf *out, struct bio *bio) +{ + prt_printf(out, "bi_remaining:\t%u\n", + atomic_read(&bio->__bi_remaining)); + prt_printf(out, "bi_end_io:\t%ps\n", + bio->bi_end_io); + prt_printf(out, "bi_status:\t%u\n", + bio->bi_status); +} + #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 50f7197c67fc..7a93e187a49a 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -419,6 +419,8 @@ static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) #define bch2_maybe_corrupt_bio(...) do {} while (0) #endif +void bch2_bio_to_text(struct printbuf *, struct bio *); + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { From 989b4c375a330c8f0cd18aa891c67ac56bec2984 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 2 Feb 2025 11:23:07 -0500 Subject: [PATCH 080/218] bcachefs: bch2_read_bio_to_text Pretty printer for struct bch_read_bio. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 18 +++++++++++++++--- fs/bcachefs/io_read.c | 35 +++++++++++++++++++++++++++++++++++ fs/bcachefs/io_read.h | 2 ++ 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c3034338f9e4..9b44f11fb0d9 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -587,6 +587,10 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); + prt_newline(out); + + prt_str_indented(out, "scrub:\t"); + prt_u64(out, data_opts->scrub); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) @@ -607,9 +611,17 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update prt_newline(out); printbuf_indent_add(out, 2); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_printf(out, "read_done:\t%u\n", m->read_done); - bch2_write_op_to_text(out, &m->op); - printbuf_indent_sub(out, 2); + + if (!m->read_done) { + prt_printf(out, "read:\n"); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, &m->rbio); + } else { + prt_printf(out, "write:\n"); + printbuf_indent_add(out, 2); + bch2_write_op_to_text(out, &m->op); + } + printbuf_indent_sub(out, 4); } int bch2_extent_drop_ptrs(struct btree_trans *trans, diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 136b6d54a2c2..df96e2c8ceda 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1487,6 +1487,41 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, return ret; } +static const char * const bch2_read_bio_flags[] = { +#define x(n) #n, + BCH_READ_FLAGS() +#undef x + NULL +}; + +void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) +{ + u64 now = local_clock(); + prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? 
now - rbio->start_time : 0); + prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0); + + if (!rbio->split) + prt_printf(out, "end_io:\t%ps\n", rbio->end_io); + else + prt_printf(out, "parent:\t%px\n", rbio->parent); + + prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io); + + prt_printf(out, "promote:\t%u\n", rbio->promote); + prt_printf(out, "bounce:\t%u\n", rbio->bounce); + prt_printf(out, "split:\t%u\n", rbio->split); + prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); + prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); + prt_printf(out, "context:\t%u\n", rbio->context); + prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret)); + + prt_printf(out, "flags:\t"); + bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); + prt_newline(out); + + bch2_bio_to_text(out, &rbio->bio); +} + void bch2_fs_io_read_exit(struct bch_fs *c) { if (c->promote_table.tbl) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 1a85b092fd1d..13bb68eb91c4 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -193,6 +193,8 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, return rbio; } +void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); + void bch2_fs_io_read_exit(struct bch_fs *); int bch2_fs_io_read_init(struct bch_fs *); From d49bafdc5d1659171d988888ebfc773629f8ca97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Sep 2024 16:22:38 -0400 Subject: [PATCH 081/218] bcachefs: fast_list A fast "list" data structure, which is actually a radix tree, with an IDA for slot allocation and a percpu buffer on top of that. Items cannot be added or moved to the head or tail, only added at some (arbitrary) position and removed. The advantage is that adding, removing and iteration is generally lockless, only hitting the lock in ida when the percpu buffer is full or empty. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/fast_list.c | 156 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/fast_list.h | 41 +++++++++++ 3 files changed, 198 insertions(+) create mode 100644 fs/bcachefs/fast_list.c create mode 100644 fs/bcachefs/fast_list.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index d2b8aec6ed8c..3be39845e4f6 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -41,6 +41,7 @@ bcachefs-y := \ extents.o \ extent_update.o \ eytzinger.o \ + fast_list.o \ fs.o \ fs-ioctl.o \ fs-io.o \ diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c new file mode 100644 index 000000000000..2faec143eb31 --- /dev/null +++ b/fs/bcachefs/fast_list.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Fast, unordered lists + * + * Supports add, remove, and iterate + * + * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot + * allocation and freeing. + * + * This means that adding, removing, and iterating over items is lockless, + * except when refilling/emptying the percpu slot buffers. + */ + +#include "fast_list.h" + +struct fast_list_pcpu { + u32 nr; + u32 entries[31]; +}; + +static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) +{ + int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); + if (unlikely(idx < 0)) + return 0; + + if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { + ida_free(&l->slots_allocated, idx); + return 0; + } + + return idx; +} + +/** + * fast_list_get_idx - get a slot in a fast_list + * @l: list to get slot in + * + * This allocates a slot in the radix tree without storing to it, so that we can + * take the potential memory allocation failure early and do the list add later + * when we can't take an allocation failure. 
+ * + * Returns: positive integer on success, -ENOMEM on failure + */ +int fast_list_get_idx(struct fast_list *l) +{ + unsigned long flags; + int idx; +retry: + local_irq_save(flags); + struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); + + if (unlikely(!lp->nr)) { + u32 entries[16], nr = 0; + + local_irq_restore(flags); + while (nr < ARRAY_SIZE(entries) && + (idx = fast_list_alloc_idx(l, GFP_KERNEL))) + entries[nr++] = idx; + local_irq_save(flags); + + lp = this_cpu_ptr(l->buffer); + + while (nr && lp->nr < ARRAY_SIZE(lp->entries)) + lp->entries[lp->nr++] = entries[--nr]; + + if (unlikely(nr)) { + local_irq_restore(flags); + while (nr) + ida_free(&l->slots_allocated, entries[--nr]); + goto retry; + } + + if (unlikely(!lp->nr)) { + local_irq_restore(flags); + return -ENOMEM; + } + } + + idx = lp->entries[--lp->nr]; + local_irq_restore(flags); + + return idx; +} + +/** + * fast_list_add - add an item to a fast_list + * @l: list + * @item: item to add + * + * Allocates a slot in the radix tree and stores to it and then returns the + * slot index, which must be passed to fast_list_remove(). + * + * Returns: positive integer on success, -ENOMEM on failure + */ +int fast_list_add(struct fast_list *l, void *item) +{ + int idx = fast_list_get_idx(l); + if (idx < 0) + return idx; + + *genradix_ptr_inlined(&l->items, idx) = item; + return idx; +} + +/** + * fast_list_remove - remove an item from a fast_list + * @l: list + * @idx: item's slot index + * + * Zeroes out the slot in the radix tree and frees the slot for future + * fast_list_add() operations. 
+ */ +void fast_list_remove(struct fast_list *l, unsigned idx) +{ + u32 entries[16], nr = 0; + unsigned long flags; + + if (!idx) + return; + + *genradix_ptr_inlined(&l->items, idx) = NULL; + + local_irq_save(flags); + struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); + + if (unlikely(lp->nr == ARRAY_SIZE(lp->entries))) + while (nr < ARRAY_SIZE(entries)) + entries[nr++] = lp->entries[--lp->nr]; + + lp->entries[lp->nr++] = idx; + local_irq_restore(flags); + + if (unlikely(nr)) + while (nr) + ida_free(&l->slots_allocated, entries[--nr]); +} + +void fast_list_exit(struct fast_list *l) +{ + /* XXX: warn if list isn't empty */ + free_percpu(l->buffer); + ida_destroy(&l->slots_allocated); + genradix_free(&l->items); +} + +int fast_list_init(struct fast_list *l) +{ + genradix_init(&l->items); + ida_init(&l->slots_allocated); + l->buffer = alloc_percpu(*l->buffer); + if (!l->buffer) + return -ENOMEM; + return 0; +} diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h new file mode 100644 index 000000000000..73c9bf591fd6 --- /dev/null +++ b/fs/bcachefs/fast_list.h @@ -0,0 +1,41 @@ +#ifndef _LINUX_FAST_LIST_H +#define _LINUX_FAST_LIST_H + +#include +#include +#include + +struct fast_list_pcpu; + +struct fast_list { + GENRADIX(void *) items; + struct ida slots_allocated;; + struct fast_list_pcpu __percpu + *buffer; +}; + +static inline void *fast_list_iter_peek(struct genradix_iter *iter, + struct fast_list *list) +{ + void **p; + while ((p = genradix_iter_peek(iter, &list->items)) && !*p) + genradix_iter_advance(iter, &list->items); + + return p ? 
*p : NULL; +} + +#define fast_list_for_each_from(_list, _iter, _i, _start) \ + for (_iter = genradix_iter_init(&(_list)->items, _start); \ + (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ + genradix_iter_advance(&(_iter), &(_list)->items)) + +#define fast_list_for_each(_list, _iter, _i) \ + fast_list_for_each_from(_list, _iter, _i, 0) + +int fast_list_get_idx(struct fast_list *l); +int fast_list_add(struct fast_list *l, void *item); +void fast_list_remove(struct fast_list *l, unsigned idx); +void fast_list_exit(struct fast_list *l); +int fast_list_init(struct fast_list *l); + +#endif /* _LINUX_FAST_LIST_H */ From 0499a82b18b5ddee0d97d2cfcae0c0120f858c1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Apr 2025 12:01:50 -0400 Subject: [PATCH 082/218] bcachefs: Async object debugging Debugging infrastructure for async objs: this lets us easily create fast_lists for various object types so they'll be visible in debugfs. To add a new object type, add it to the BCH_ASYNC_OBJ_LISTS() x-macro and add a matching pretty-printer wrapper in async_objs.c. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 4 ++ fs/bcachefs/Makefile | 2 + fs/bcachefs/async_objs.c | 105 +++++++++++++++++++++++++++++++++ fs/bcachefs/async_objs.h | 44 ++++++++++++++ fs/bcachefs/async_objs_types.h | 20 +++++++ fs/bcachefs/bcachefs.h | 7 +++ fs/bcachefs/debug.c | 52 +++++++--------- fs/bcachefs/debug.h | 18 ++++++ fs/bcachefs/errcode.h | 1 + fs/bcachefs/super.c | 3 + 10 files changed, 224 insertions(+), 32 deletions(-) create mode 100644 fs/bcachefs/async_objs.c create mode 100644 fs/bcachefs/async_objs.h create mode 100644 fs/bcachefs/async_objs_types.h diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index a14e4a60b187..8cb2b9d5da96 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -107,6 +107,10 @@ config BCACHEFS_TRANS_KMALLOC_TRACE bool "Trace bch2_trans_kmalloc() calls" depends on BCACHEFS_FS +config BCACHEFS_ASYNC_OBJECT_LISTS + bool "Keep async objects on fast_lists for debugfs visibility" + depends on BCACHEFS_FS && DEBUG_FS + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 3be39845e4f6..93c8ee5425c8 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -99,6 +99,8 @@ bcachefs-y := \ varint.o \ xattr.o +bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o + obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o # Silence "note: xyz changed in GCC X.X" messages diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c new file mode 100644 index 000000000000..8d78f390a759 --- /dev/null +++ b/fs/bcachefs/async_objs.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Async obj debugging: keep asynchronous objects on (very fast) lists, make + * them visible in debugfs: + */ + +#include "bcachefs.h" +#include "async_objs.h" +#include "btree_io.h" +#include "debug.h" +#include "io_read.h" + +#include <linux/debugfs.h> + +static int
bch2_async_obj_list_open(struct inode *inode, struct file *file) +{ + struct async_obj_list *list = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->from = POS_MIN; + i->iter = 0; + i->c = container_of(list, struct bch_fs, async_objs[list->idx]); + i->list = list; + i->buf = PRINTBUF; + return 0; +} + +static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct async_obj_list *list = i->list; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + struct genradix_iter iter; + void *obj; + fast_list_for_each_from(&list->list, iter, obj, i->iter) { + ret = bch2_debugfs_flush_buf(i); + if (ret) + return ret; + + if (!i->size) + break; + + list->obj_to_text(&i->buf, obj); + } + + if (i->buf.allocation_failure) + ret = -ENOMEM; + else + i->iter = iter.pos; + + if (!ret) + ret = bch2_debugfs_flush_buf(i); + + return ret ?: i->ret; +} + +__maybe_unused +static const struct file_operations async_obj_ops = { + .owner = THIS_MODULE, + .open = bch2_async_obj_list_open, + .release = bch2_dump_release, + .read = bch2_async_obj_list_read, +}; + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) +{ + c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); + +#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ + &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); + BCH_ASYNC_OBJ_LISTS() +#undef x +} + +void bch2_fs_async_obj_exit(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) + fast_list_exit(&c->async_objs[i].list); +} + +int bch2_fs_async_obj_init(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { + if (fast_list_init(&c->async_objs[i].list)) + return -BCH_ERR_ENOMEM_async_obj_init; + c->async_objs[i].idx = i; + } + +#define x(n) 
c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; + BCH_ASYNC_OBJ_LISTS() +#undef x + + return 0; +} diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h new file mode 100644 index 000000000000..cd6489b8cf76 --- /dev/null +++ b/fs/bcachefs/async_objs.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ASYNC_OBJS_H +#define _BCACHEFS_ASYNC_OBJS_H + +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS +static inline void __async_object_list_del(struct fast_list *head, unsigned idx) +{ + fast_list_remove(head, idx); +} + +static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) +{ + int ret = fast_list_add(head, obj); + *idx = ret > 0 ? ret : 0; + return ret < 0 ? ret : 0; +} + +#define async_object_list_del(_c, _list, idx) \ + __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx) + +#define async_object_list_add(_c, _list, obj, idx) \ + __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *); +void bch2_fs_async_obj_exit(struct bch_fs *); +int bch2_fs_async_obj_init(struct bch_fs *); + +#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#define async_object_list_del(_c, _n, idx) do {} while (0) + +static inline int __async_object_list_add(void) +{ + return 0; +} +#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() + +static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} +static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} +static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } + +#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#endif /* _BCACHEFS_ASYNC_OBJS_H */ diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h new file mode 100644 index 000000000000..28cb73e3f56d --- /dev/null +++ b/fs/bcachefs/async_objs_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef 
_BCACHEFS_ASYNC_OBJS_TYPES_H +#define _BCACHEFS_ASYNC_OBJS_TYPES_H + +#define BCH_ASYNC_OBJ_LISTS() + +enum bch_async_obj_lists { +#define x(n) BCH_ASYNC_OBJ_LIST_##n, + BCH_ASYNC_OBJ_LISTS() +#undef x + BCH_ASYNC_OBJ_NR +}; + +struct async_obj_list { + struct fast_list list; + void (*obj_to_text)(struct printbuf *, void *); + unsigned idx; +}; + +#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3d18dbe0d6f5..94e3edd932e3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -209,6 +209,7 @@ #include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" +#include "fast_list.h" #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" @@ -474,6 +475,7 @@ enum bch_time_stats { }; #include "alloc_types.h" +#include "async_objs_types.h" #include "btree_gc_types.h" #include "btree_types.h" #include "btree_node_scan_types.h" @@ -1027,6 +1029,10 @@ struct bch_fs { nocow_locks; struct rhashtable promote_table; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; +#endif + mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; @@ -1115,6 +1121,7 @@ struct bch_fs { /* DEBUG JUNK */ struct dentry *fs_debug_dir; struct dentry *btree_debug_dir; + struct dentry *async_obj_dir; struct btree_debug btree_debug[BTREE_ID_NR]; struct btree *verify_data; struct btree_node *verify_ondisk; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 4cbb19c36fa1..079bc2b359cd 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -16,6 +17,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" +#include "data_update.h" #include "debug.h" #include "error.h" #include 
"extents.h" @@ -306,23 +308,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, #ifdef CONFIG_DEBUG_FS -/* XXX: bch_fs refcounting */ - -struct dump_iter { - struct bch_fs *c; - enum btree_id id; - struct bpos from; - struct bpos prev_node; - u64 iter; - - struct printbuf buf; - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -static ssize_t flush_buf(struct dump_iter *i) +ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); @@ -360,7 +346,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) return 0; } -static int bch2_dump_release(struct inode *inode, struct file *file) +int bch2_dump_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; @@ -378,7 +364,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: + return bch2_debugfs_flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, BTREE_ITER_prefetch| @@ -387,7 +373,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, prt_newline(&i->buf); bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); - flush_buf(i); + bch2_debugfs_flush_buf(i); }))) ?: i->ret; } @@ -408,7 +394,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, i->size = size; i->ret = 0; - ssize_t ret = flush_buf(i); + ssize_t ret = bch2_debugfs_flush_buf(i); if (ret) return ret; @@ -422,7 +408,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ? 
bpos_successor(b->key.k.p) : b->key.k.p; - drop_locks_do(trans, flush_buf(i)); + drop_locks_do(trans, bch2_debugfs_flush_buf(i)); }))) ?: i->ret; } @@ -442,7 +428,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: + return bch2_debugfs_flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, BTREE_ITER_prefetch| @@ -460,7 +446,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_bfloat_to_text(&i->buf, l->b, _k); bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); - flush_buf(i); + bch2_debugfs_flush_buf(i); }))) ?: i->ret; } @@ -521,7 +507,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, struct rhash_head *pos; struct btree *b; - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) return ret; @@ -544,7 +530,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -618,7 +604,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, closure_put(&trans->ref); - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) goto unlocked; @@ -631,7 +617,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -656,7 +642,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->ret = 0; while (1) { - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -699,7 +685,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, i->iter++; } - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -757,7 +743,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char 
__user *buf, while (1) { struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -878,7 +864,7 @@ static ssize_t bch2_simple_print(struct file *file, char __user *buf, ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -967,6 +953,8 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("write_points", 0400, c->fs_debug_dir, c->btree_debug, &write_points_ops); + bch2_fs_async_obj_debugfs_init(c); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 2c37143b5fd1..52dbea736709 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -19,6 +19,24 @@ static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) } #ifdef CONFIG_DEBUG_FS +struct dump_iter { + struct bch_fs *c; + struct async_obj_list *list; + enum btree_id id; + struct bpos from; + struct bpos prev_node; + u64 iter; + + struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +ssize_t bch2_debugfs_flush_buf(struct dump_iter *); +int bch2_dump_release(struct inode *, struct file *); + void bch2_fs_debug_exit(struct bch_fs *); void bch2_fs_debug_init(struct bch_fs *); #else diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 6a4b3fe9ea99..1a52edc7c8d8 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -53,6 +53,7 @@ x(ENOMEM, ENOMEM_dio_write_bioset_init) \ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ x(ENOMEM, ENOMEM_promote_table_init) \ + x(ENOMEM, ENOMEM_async_obj_init) \ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c 
index bed0f8a80212..f29965469b28 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -10,6 +10,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" @@ -579,6 +580,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_free_pending_node_rewrites(c); bch2_free_fsck_errs(c); bch2_fs_accounting_exit(c); + bch2_fs_async_obj_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -971,6 +973,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, } ret = + bch2_fs_async_obj_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: From 41e51769b8a649dd3db7070370cb6aa127f86307 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Apr 2025 13:02:51 -0400 Subject: [PATCH 083/218] bcachefs: Make various async objs visible in debugfs Add async objs list for - promote_op - bch_read_bio - btree_read_bio - btree_write_bio This gets us introspection on in-flight async ops, and because under the hood it uses fast_lists (percpu slot buffer on top of a radix tree), it'll be fast enough to enable in production. This will be very helpful for debugging "something got stuck" issues, which have been cropping up from time to time (in the CI, especially with folio writeback). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/async_objs.c | 23 +++++++++++++++++++- fs/bcachefs/async_objs_types.h | 6 +++++- fs/bcachefs/btree_io.c | 12 +++++++++++ fs/bcachefs/btree_io.h | 8 +++++++ fs/bcachefs/data_update.h | 15 +++++++++++++ fs/bcachefs/io_read.c | 39 ++++++++++++++++++++++------------ fs/bcachefs/io_read.h | 12 +++++++++++ 7 files changed, 100 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c index 8d78f390a759..57e2fe421461 100644 --- a/fs/bcachefs/async_objs.c +++ b/fs/bcachefs/async_objs.c @@ -12,6 +12,28 @@ #include +static void promote_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_promote_op_to_text(out, obj); +} + +static void rbio_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_read_bio_to_text(out, obj); +} + +static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) +{ + struct btree_read_bio *rbio = obj; + bch2_btree_read_bio_to_text(out, rbio); +} + +static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj) +{ + struct btree_write_bio *wbio = obj; + bch2_bio_to_text(out, &wbio->wbio.bio); +} + static int bch2_async_obj_list_open(struct inode *inode, struct file *file) { struct async_obj_list *list = inode->i_private; @@ -65,7 +87,6 @@ static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, return ret ?: i->ret; } -__maybe_unused static const struct file_operations async_obj_ops = { .owner = THIS_MODULE, .open = bch2_async_obj_list_open, diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h index 28cb73e3f56d..310a4f90f49b 100644 --- a/fs/bcachefs/async_objs_types.h +++ b/fs/bcachefs/async_objs_types.h @@ -2,7 +2,11 @@ #ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H #define _BCACHEFS_ASYNC_OBJS_TYPES_H -#define BCH_ASYNC_OBJ_LISTS() +#define BCH_ASYNC_OBJ_LISTS() \ + x(promote) \ + x(rbio) \ + x(btree_read_bio) \ + x(btree_write_bio) enum bch_async_obj_lists { #define x(n) BCH_ASYNC_OBJ_LIST_##n, diff --git 
a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 8fe9e0fc6629..84dae4c1ec13 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "async_objs.h" #include "bkey_buf.h" #include "bkey_methods.h" #include "bkey_sort.h" @@ -1376,6 +1377,7 @@ static void btree_node_read_work(struct work_struct *work) } } + async_object_list_del(c, btree_read_bio, rb->list_idx); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); @@ -1416,6 +1418,11 @@ static void btree_node_read_endio(struct bio *bio) queue_work(c->btree_read_complete_wq, &rb->work); } +void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio) +{ + bch2_bio_to_text(out, &rbio->bio); +} + struct btree_node_read_all { struct closure cl; struct bch_fs *c; @@ -1748,6 +1755,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bio->bi_end_io = btree_node_read_endio; bch2_bio_map(bio, b->data, btree_buf_bytes(b)); + async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); + if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); @@ -2121,6 +2130,7 @@ static void btree_node_write_work(struct work_struct *work) goto err; } out: + async_object_list_del(c, btree_write_bio, wbio->list_idx); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b, start_time); return; @@ -2473,6 +2483,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) atomic64_inc(&c->btree_write_stats[type].nr); atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); + INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->btree_write_submit_wq, &wbio->work); return; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index dbf76d22c660..afdb11a9f71c 100644 --- a/fs/bcachefs/btree_io.h +++ 
b/fs/bcachefs/btree_io.h @@ -41,6 +41,9 @@ struct btree_read_bio { u64 start_time; unsigned have_ioref:1; unsigned idx:7; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; @@ -53,6 +56,9 @@ struct btree_write_bio { unsigned data_bytes; unsigned sector_offset; u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct bch_write_bio wbio; }; @@ -133,6 +139,8 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); + int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, unsigned); diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index ed05125867da..5e14d13568de 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -50,6 +50,21 @@ struct data_update { struct bio_vec *bvecs; }; +struct promote_op { + struct rcu_head rcu; + u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + + struct rhash_head hash; + struct bpos pos; + + struct work_struct work; + struct data_update write; + struct bio_vec bi_inline_vecs[]; /* must be last */ +}; + void bch2_data_update_to_text(struct printbuf *, struct data_update *); void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index df96e2c8ceda..abfd3a4c1d7d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -9,6 +9,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" @@ -88,18 +89,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) /* Cache promotion on read */ -struct 
promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct work_struct work; - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ -}; - static const struct rhashtable_params bch_promote_params = { .head_offset = offsetof(struct promote_op, hash), .key_offset = offsetof(struct promote_op, pos), @@ -177,6 +166,8 @@ static noinline void promote_free(struct bch_read_bio *rbio) bch_promote_params); BUG_ON(ret); + async_object_list_del(c, promote, op->list_idx); + bch2_data_update_exit(&op->write); enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); @@ -262,6 +253,10 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, goto err; } + ret = async_object_list_add(c, promote, op, &op->list_idx); + if (ret < 0) + goto err_remove_hash; + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), &orig->opts, @@ -273,7 +268,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, * -BCH_ERR_ENOSPC_disk_reservation: */ if (ret) - goto err_remove_hash; + goto err_remove_list; rbio_init_fragment(&op->write.rbio.bio, orig); op->write.rbio.bounce = true; @@ -281,6 +276,8 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, op->write.op.end_io = promote_done; return &op->write.rbio; +err_remove_list: + async_object_list_del(c, promote, op->list_idx); err_remove_hash: BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params)); @@ -353,6 +350,18 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, return NULL; } +void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op) +{ + if (!op->write.read_done) { + prt_printf(out, "parent read: %px\n", op->write.rbio.parent); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, op->write.rbio.parent); + printbuf_indent_sub(out, 2); + } + + bch2_data_update_to_text(out, &op->write); +} + /* Read */ 
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, @@ -421,6 +430,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) else promote_free(rbio); } else { + async_object_list_del(rbio->c, rbio, rbio->list_idx); + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); @@ -1246,6 +1257,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; + async_object_list_add(c, rbio, rbio, &rbio->list_idx); + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 13bb68eb91c4..c08b9c047b3e 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -4,6 +4,7 @@ #include "bkey_buf.h" #include "btree_iter.h" +#include "extents_types.h" #include "reflink.h" struct bch_read_bio { @@ -48,6 +49,9 @@ struct bch_read_bio { u16 _state; }; s16 ret; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; @@ -173,6 +177,9 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, rbio->split = true; rbio->parent = orig; rbio->opts = orig->opts; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif return rbio; } @@ -190,9 +197,14 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif return rbio; } +struct promote_op; +void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); void bch2_fs_io_read_exit(struct bch_fs *); From dbc18c97f1f0d336e3c4f6bb50f34c5255216995 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 06:03:33 -0400 Subject: [PATCH 084/218] bcachefs: print_string_as_lines: avoid printing empty 
line If the final line in the message to be printed is blank, don't print it. This happens with indented printbufs - after a newline we emit spaces up to the indent level. Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 7e6ebe8cd9ea..1cff407c8c9d 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -252,6 +252,16 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } +static bool string_is_spaces(const char *str) +{ + while (*str) { + if (*str != ' ') + return false; + str++; + } + return true; +} + void bch2_print_string_as_lines(const char *prefix, const char *lines, bool nonblocking) { @@ -272,6 +282,9 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines, while (*lines) { p = strchrnul(lines, '\n'); + if (!*p && string_is_spaces(lines)) + break; + printk("%s%.*s\n", prefix, (int) (p - lines), lines); if (!*p) break; From 353b89c6e6df522c221997a527358854b1c826d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 05:45:48 -0400 Subject: [PATCH 085/218] bcachefs: bch2_io_failures_to_text() Pretty printer for bch_io_failures, to be used for better read error messages. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/extents.h | 2 ++ 2 files changed, 42 insertions(+) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ef116c55f0a7..8a881b30fd4c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -45,6 +45,46 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, struct bch_extent_crc_unpacked, enum bch_extent_entry_type); +void bch2_io_failures_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_io_failures *failed) +{ + static const char * const error_types[] = { + "io", "checksum", "ec reconstruct", NULL + }; + + for (struct bch_dev_io_failures *f = failed->devs; + f < failed->devs + failed->nr; + f++) { + bch2_printbuf_make_room(out, 1024); + rcu_read_lock(); + out->atomic++; + struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); + if (ca) + prt_str(out, ca->name); + else + prt_printf(out, "(invalid device %u)", f->dev); + --out->atomic; + rcu_read_unlock(); + + prt_char(out, ' '); + + unsigned errflags = + ((!!f->failed_io) << 0) | + ((!!f->failed_csum_nr) << 1) | + ((!!f->failed_ec) << 2); + + if (is_power_of_2(errflags)) { + prt_bitflags(out, error_types, errflags); + prt_str(out, " error"); + } else { + prt_str(out, "errors: "); + prt_bitflags(out, error_types, errflags); + } + prt_newline(out); + } +} + struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, unsigned dev) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9fe153183b36..9dd2655a5774 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -392,6 +392,8 @@ out: \ /* utility code common to all keys with pointers: */ +void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_failures *); struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, From 156d9e8341e8aad55b0e79b2dc54003cb14e5077 Mon Sep 17 
00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 05:49:20 -0400 Subject: [PATCH 086/218] bcachefs: Emit a single log message on data read error Instead of emitting a message immediately when we get an error in the read path, and then another at the end if we successfully retry - emit one single log message before returning from bch2_rbio_retry(). Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 10 +++++ fs/bcachefs/io_read.c | 83 +++++++++++------------------------------- 2 files changed, 31 insertions(+), 62 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 94e3edd932e3..8989ea4a3934 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -295,6 +295,16 @@ do { \ bch2_print(_c, __VA_ARGS__); \ } while (0) +#define bch2_print_str_ratelimited(_c, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print_str(_c, __VA_ARGS__); \ +} while (0) + #define bch_info(c, fmt, ...) \ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_info_ratelimited(c, fmt, ...) 
\ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index abfd3a4c1d7d..cc708d46557e 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -582,7 +582,6 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; - int orig_error = rbio->ret; struct btree_trans *trans = bch2_trans_get(c); @@ -623,10 +622,11 @@ static void bch2_rbio_retry(struct work_struct *work) if (ret) { rbio->ret = ret; rbio->bio.bi_status = BLK_STS_IOERR; - } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace && - orig_error != -BCH_ERR_data_read_ptr_stale_race && - !failed.nr) { + } + + if (failed.nr || ret) { struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, @@ -634,9 +634,22 @@ static void bch2_rbio_retry(struct work_struct *work) read_pos.offset << 9)); if (rbio->data_update) prt_str(&buf, "(internal move) "); - prt_str(&buf, "successful retry"); - bch_err_ratelimited(c, "%s", buf.buf); + prt_str(&buf, "data read error, "); + if (!ret) + prt_str(&buf, "successful retry"); + else + prt_str(&buf, bch2_err_str(ret)); + prt_newline(&buf); + + if (!bkey_deleted(&sk.k->k)) { + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); + prt_newline(&buf); + } + + bch2_io_failures_to_text(&buf, c, &failed); + + bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -671,27 +684,6 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, } } -static void bch2_read_io_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bio *bio = &rbio->bio; - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); - - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - printbuf_exit(&buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); -} - static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { @@ -755,31 +747,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) __bch2_rbio_narrow_crcs(trans, rbio)); } -static void bch2_read_csum_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - static void bch2_read_decompress_err(struct work_struct *work) { struct bch_read_bio *rbio = @@ -940,7 +907,7 @@ static void __bch2_read_endio(struct work_struct *work) memalloc_nofs_restore(nofs_flags); return; csum_err: - bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); goto out; decompression_err: bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); @@ -966,7 +933,7 @@ static void bch2_read_endio(struct bio *bio) rbio->bio.bi_end_io = rbio->end_io; if (unlikely(bio->bi_status)) { - bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); return; } @@ -1292,14 +1259,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); - prt_printf(&buf, "no device to read from:\n "); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_device_offline, BLK_STS_IOERR); From b3bbd47f8314997b7a13cfe0b2048a19a5b62fcf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 09:16:49 -0400 Subject: [PATCH 087/218] bcachefs: Kill redundant error message in topology repair The btree node read path already logs btree node read errors, this isn't needed. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2e72784332ff..fecf88079127 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -371,10 +371,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "Topology repair: unreadable btree node at\n%s", - buf.buf)) { + if (bch2_err_matches(ret, EIO)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; ret = bch2_journal_key_delete(c, b->c.btree_id, From 3be132f93cff2586be482cb81807ff83899f572e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Apr 2025 09:09:56 -0400 Subject: [PATCH 088/218] bcachefs: bch2_btree_lost_data() now handles snapshots tree We have a consolidated place for "this btree lost data, run this repair", so use it. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 ++++ fs/bcachefs/snapshot.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2a8bcb9b1dd2..8f45d9e3a47e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -95,6 +95,10 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) case BTREE_ID_accounting: ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; goto out; + case BTREE_ID_snapshots: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + goto out; default: ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; goto out; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 14ea09ccee37..94cf60f76b64 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1743,10 +1743,6 @@ int bch2_snapshots_read(struct bch_fs *c) BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && test_bit(BCH_FS_may_go_rw, &c->flags)); - if (bch2_err_matches(ret, EIO) || - (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); - return ret; } From 3aecbb01a168bf6396955e5da0533f6e5f000441 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Apr 2025 09:13:28 -0400 Subject: [PATCH 089/218] bcachefs: Remove redundant calls to btree_lost_data() The btree node read path calls this before returning the read error. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 7 ------- fs/bcachefs/recovery.c | 3 --- 2 files changed, 10 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index fecf88079127..92ae31737a24 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -378,10 +378,6 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct b->c.level, cur_k.k->k.p); if (ret) break; - - ret = bch2_btree_lost_data(c, b->c.btree_id); - if (ret) - break; continue; } @@ -543,9 +539,6 @@ int bch2_check_topology(struct bch_fs *c) bch2_btree_id_to_text(&buf, i); if (r->error) { - ret = bch2_btree_lost_data(c, i); - if (ret) - break; reconstruct_root: bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8f45d9e3a47e..a0b42cca86fb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -587,9 +587,6 @@ static int read_btree_roots(struct bch_fs *c) buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; - - ret = bch2_btree_lost_data(c, i); - BUG_ON(ret); } } From 300904700f14e4e05db2a16cf8e3890c8e856cf8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Apr 2025 09:28:56 -0400 Subject: [PATCH 090/218] bcachefs: kill bch2_run_explicit_recovery_pass_persistent() No longer has users, so we can kill it and rename bch2_run_explicit_recovery_pass_persistent_locked(). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 30 +++++++++++++++--------------- fs/bcachefs/recovery_passes.c | 19 +------------------ fs/bcachefs/recovery_passes.h | 1 - 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a0b42cca86fb..b1afbe446d9e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -50,24 +50,24 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_topology) ?: ret; /* Btree node accounting will be off: */ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; #endif switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); 
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); @@ -77,30 +77,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; goto out; case BTREE_ID_need_discard: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_freespace: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_lru: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_accounting: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; goto out; case BTREE_ID_snapshots: - ret = 
bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; goto out; default: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; goto out; } out: diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 87150dd30f4b..9be715a49454 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -193,7 +193,7 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, return ret; } -int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { lockdep_assert_held(&c->sb_lock); @@ -204,23 +204,6 @@ int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, return bch2_run_explicit_recovery_pass(c, pass); } -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (!test_bit_le64(s, ext->recovery_passes_required)) { - __set_bit_le64(s, ext->recovery_passes_required); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); - - return bch2_run_explicit_recovery_pass(c, pass); -} - static void bch2_clear_recovery_pass_required(struct bch_fs *c, enum bch_recovery_pass pass) { diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index e19a8aaba2f8..62957e268a66 100644 --- 
a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -12,7 +12,6 @@ int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *, struct printbuf *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); -int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); From 600a9207c8def056b4681fde8158c463576d5aca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Apr 2025 09:27:10 -0400 Subject: [PATCH 091/218] bcachefs: Plumb printbuf through bch2_btree_lost_data() Part of the ongoing project to improve error messages by building them up in printbufs and emitting them all at once, so that we can easily see what events are related in the log. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 28 +++++++++++++++-------- fs/bcachefs/recovery.c | 42 ++++++++++++++++++----------------- fs/bcachefs/recovery.h | 2 +- fs/bcachefs/recovery_passes.c | 9 ++++---- fs/bcachefs/recovery_passes.h | 4 +++- 5 files changed, 50 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 84dae4c1ec13..41df1035ba2f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1304,7 +1304,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, retry_read = 1; } else { set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); } goto out; } @@ -1372,15 +1371,16 @@ static void btree_node_read_work(struct work_struct *work) if (!can_retry) { set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); break; } } - - async_object_list_del(c, btree_read_bio, rb->list_idx); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], - rb->start_time); - bio_put(&rb->bio); + if (btree_node_read_error(b)) { + struct printbuf buf = PRINTBUF; + 
bch2_btree_lost_data(c, &buf, b->c.btree_id); + if (buf.pos) + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } if ((saw_error || btree_node_need_rewrite(b)) && @@ -1398,6 +1398,10 @@ static void btree_node_read_work(struct work_struct *work) bch2_btree_node_rewrite_async(c, b); } + async_object_list_del(c, btree_read_bio, rb->list_idx); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); printbuf_exit(&buf); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); @@ -1587,7 +1591,12 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) if (ret) { set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); + + struct printbuf buf = PRINTBUF; + bch2_btree_lost_data(c, &buf, b->c.btree_id); + if (buf.pos) + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); @@ -1721,6 +1730,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); + prt_newline(&buf); + bch2_btree_lost_data(c, &buf, b->c.btree_id); bch_err_ratelimited(c, "%s", buf.buf); if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && @@ -1728,7 +1739,6 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_fatal_error(c); set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b1afbe446d9e..d13a6df289c7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -33,7 +33,9 @@ #include #include -int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +int bch2_btree_lost_data(struct bch_fs *c, + struct printbuf *msg, + enum btree_id btree) { u64 b = BIT_ULL(btree); int ret = 0; @@ -42,32 +44,32 @@ int 
bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (!(c->sb.btrees_lost_data & b)) { - struct printbuf buf = PRINTBUF; - bch2_btree_id_to_text(&buf, btree); - bch_err(c, "flagging btree %s lost data", buf.buf); - printbuf_exit(&buf); + prt_printf(msg, "flagging btree "); + bch2_btree_id_to_text(msg, btree); + prt_printf(msg, " lost data\n"); + ext->btrees_lost_data |= cpu_to_le64(b); } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_topology) ?: ret; /* Btree node accounting will be off: */ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_lrus) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; #endif switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); 
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); @@ -77,30 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; goto out; case BTREE_ID_need_discard: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_freespace: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_lru: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_accounting: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; goto out; case BTREE_ID_snapshots: - ret = bch2_run_explicit_recovery_pass_persistent(c, 
BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; goto out; default: - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; goto out; } out: diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index d858ba674eaa..c023f52fc2d6 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -int bch2_btree_lost_data(struct bch_fs *, enum btree_id); +int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); void bch2_reconstruct_alloc(struct bch_fs *); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 9be715a49454..347e17fe7901 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -141,13 +141,13 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, if (pass < BCH_RECOVERY_PASS_set_may_go_rw && c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { if (print) - prt_printf(out, "need recovery pass %s (%u), but already rw", + prt_printf(out, "need recovery pass %s (%u), but already rw\n", bch2_recovery_passes[pass], pass); return -BCH_ERR_cannot_rewind_recovery; } if (print) - prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)", + prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)\n", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); @@ -194,14 +194,15 @@ int 
bch2_run_explicit_recovery_pass(struct bch_fs *c, } int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - enum bch_recovery_pass pass) + struct printbuf *out, + enum bch_recovery_pass pass) { lockdep_assert_held(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); - return bch2_run_explicit_recovery_pass(c, pass); + return bch2_run_explicit_recovery_pass_printbuf(c, out, pass); } static void bch2_clear_recovery_pass_required(struct bch_fs *c, diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 62957e268a66..1f91be4258c5 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -12,7 +12,9 @@ int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *, struct printbuf *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); + +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); int bch2_run_recovery_passes(struct bch_fs *); From d31f155964aee6e6141967fc392a9a99b221e117 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 09:02:15 -0400 Subject: [PATCH 092/218] bcachefs: bch2_fsck_err_opt() Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/error.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/error.h | 4 ++++ 3 files changed, 47 insertions(+) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 1a52edc7c8d8..4aac0182cbed 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -175,6 +175,7 @@ x(0, backpointer_to_overwritten_btree_node) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ + x(BCH_ERR_fsck, fsck_ask) \ x(BCH_ERR_fsck, fsck_fix) \ x(BCH_ERR_fsck, 
fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 20495062d6e1..731733e12e6b 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -393,6 +393,48 @@ bool __bch2_count_fsck_err(struct bch_fs *c, return print && !repeat; } +int bch2_fsck_err_opt(struct bch_fs *c, + enum bch_fsck_flags flags, + enum bch_sb_error_id err) +{ + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + + if (test_bit(BCH_FS_fsck_running, &c->flags)) { + if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) + return -BCH_ERR_fsck_repair_unimplemented; + + switch (c->opts.fix_errors) { + case FSCK_FIX_exit: + return -BCH_ERR_fsck_errors_not_fixed; + case FSCK_FIX_yes: + if (flags & FSCK_CAN_FIX) + return -BCH_ERR_fsck_fix; + fallthrough; + case FSCK_FIX_no: + if (flags & FSCK_CAN_IGNORE) + return -BCH_ERR_fsck_ignore; + return -BCH_ERR_fsck_errors_not_fixed; + case FSCK_FIX_ask: + if (flags & FSCK_AUTOFIX) + return -BCH_ERR_fsck_fix; + return -BCH_ERR_fsck_ask; + default: + BUG(); + } + } else { + if ((flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) + return -BCH_ERR_fsck_fix; + + if (c->opts.errors == BCH_ON_ERROR_continue && + (flags & FSCK_CAN_IGNORE)) + return -BCH_ERR_fsck_ignore; + return -BCH_ERR_fsck_errors_not_fixed; + } +} + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 0b3ede1c2015..d89dd270b2e5 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -80,6 +80,10 @@ bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbu #define bch2_count_fsck_err(_c, _err, ...) 
\ __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) +int bch2_fsck_err_opt(struct bch_fs *, + enum bch_fsck_flags, + enum bch_sb_error_id); + __printf(5, 6) __cold int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, From 9c2472658be20d04c6dc34d5314a7e99cc4fed25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 20:38:50 -0400 Subject: [PATCH 093/218] bcachefs: bch2_mark_btree_validate_failure() Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 31 ++++++++++++++++++++++++++----- fs/bcachefs/extents.h | 1 + fs/bcachefs/extents_types.h | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 8a881b30fd4c..c4fe4ffd41f1 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -56,6 +56,14 @@ void bch2_io_failures_to_text(struct printbuf *out, for (struct bch_dev_io_failures *f = failed->devs; f < failed->devs + failed->nr; f++) { + unsigned errflags = + ((!!f->failed_io) << 0) | + ((!!f->failed_csum_nr) << 1) | + ((!!f->failed_ec) << 2); + + if (!errflags) + continue; + bch2_printbuf_make_room(out, 1024); rcu_read_lock(); out->atomic++; @@ -69,11 +77,6 @@ void bch2_io_failures_to_text(struct printbuf *out, prt_char(out, ' '); - unsigned errflags = - ((!!f->failed_io) << 0) | - ((!!f->failed_csum_nr) << 1) | - ((!!f->failed_ec) << 2); - if (is_power_of_2(errflags)) { prt_bitflags(out, error_types, errflags); prt_str(out, " error"); @@ -119,6 +122,22 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, f->failed_csum_nr++; } +void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, + unsigned dev) +{ + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + memset(f, 0, sizeof(*f)); + f->dev = dev; + } + + f->failed_btree_validate = true; +} + static inline u64 dev_latency(struct bch_dev *ca) { return 
ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; @@ -219,6 +238,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { have_io_errors |= f->failed_io; + have_io_errors |= f->failed_btree_validate; have_io_errors |= f->failed_ec; } have_csum_errors |= !!f->failed_csum_nr; @@ -226,6 +246,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (p.has_ec && (f->failed_io || f->failed_csum_nr)) p.do_ec_reconstruct = true; else if (f->failed_io || + f->failed_btree_validate || f->failed_csum_nr > c->opts.checksum_err_retry_nr) continue; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9dd2655a5774..b8590e51b76e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -398,6 +398,7 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *, bool); +void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *, int); diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index e51529dca4c2..b23ce4a373c0 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -34,6 +34,7 @@ struct bch_io_failures { u8 dev; unsigned failed_csum_nr:6, failed_io:1, + failed_btree_validate:1, failed_ec:1; } devs[BCH_REPLICAS_MAX + 1]; }; From cd3cdb1ef706a1ac725194d81858d58375739b25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 09:14:19 -0400 Subject: [PATCH 094/218] bcachefs: Single err message for btree node reads Like we just did with the data read path, emit a single error message per btree node read, nicely formatted, with all the actions we took grouped together.
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 260 +++++++++++++++++++++++------------------ fs/bcachefs/btree_io.h | 4 +- fs/bcachefs/debug.c | 4 +- 3 files changed, 154 insertions(+), 114 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 41df1035ba2f..e079e12adf86 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -516,19 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, + bool print_pos, struct btree *b, struct bset *i, struct bkey_packed *k, - unsigned offset, int write) + unsigned offset, int rw) { - prt_printf(out, bch2_log_msg(c, "%s"), - write == READ - ? "error validating btree node " - : "corrupt btree node before write "); - if (ca) - prt_printf(out, "on %s ", ca->name); - prt_printf(out, "at btree "); - bch2_btree_pos_to_text(out, c, b); + if (print_pos) { + prt_str(out, rw == READ + ? "error validating btree node " + : "corrupt btree node before write "); + prt_printf(out, "at btree "); + bch2_btree_pos_to_text(out, c, b); + prt_newline(out); + } - prt_printf(out, "\nnode offset %u/%u", + if (ca) + prt_printf(out, "%s ", ca->name); + + prt_printf(out, "node offset %u/%u", b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); @@ -539,75 +543,110 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -__printf(10, 11) +__printf(11, 12) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, struct bkey_packed *k, - int write, - bool have_retry, + int rw, enum bch_sb_error_id err_type, + struct bch_io_failures *failed, + struct printbuf *err_msg, const char *fmt, ...) 
{ - bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; + if (c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + return -BCH_ERR_fsck_fix; + + bool have_retry = false; + int ret2; + + if (ca) { + bch2_mark_btree_validate_failure(failed, ca->dev_idx); + + struct extent_ptr_decoded pick; + have_retry = !bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + failed, &pick, -1); + } if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) ret = -BCH_ERR_btree_node_read_err_fixable; if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ret = -BCH_ERR_btree_node_read_err_bad_node; - if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) - bch2_sb_error_count(c, err_type); + bch2_sb_error_count(c, err_type); + + bool print_deferred = err_msg && + rw == READ && + !(test_bit(BCH_FS_fsck_running, &c->flags) && + c->opts.fix_errors == FSCK_FIX_ask); struct printbuf out = PRINTBUF; - if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) { - printbuf_indent_add_nextline(&out, 2); -#ifdef BCACHEFS_LOG_PREFIX - prt_printf(&out, bch2_log_msg(c, "")); -#endif - } + bch2_log_msg_start(c, &out); - btree_err_msg(&out, c, ca, b, i, k, b->written, write); + if (!print_deferred) + err_msg = &out; + + btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); va_list args; va_start(args, fmt); - prt_vprintf(&out, fmt, args); + prt_vprintf(err_msg, fmt, args); va_end(args); - if (write == WRITE) { + if (print_deferred) { + prt_newline(err_msg); + + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: + ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); + if (ret2 != -BCH_ERR_fsck_fix && + ret2 != -BCH_ERR_fsck_ignore) { + ret = ret2; + goto fsck_err; + } + + if (!have_retry) + ret = -BCH_ERR_fsck_fix; + goto out; + case -BCH_ERR_btree_node_read_err_bad_node: + prt_str(&out, ", "); + ret = __bch2_topology_error(c, &out); + break; + } + + goto out; + } + + if (rw == WRITE) { 
prt_str(&out, ", "); ret = __bch2_inconsistent_error(c, &out) ? -BCH_ERR_fsck_errors_not_fixed : 0; - silent = false; + goto print; } switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = !silent - ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) - : -BCH_ERR_fsck_fix; - if (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore) + ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); + if (ret2 != -BCH_ERR_fsck_fix && + ret2 != -BCH_ERR_fsck_ignore) { + ret = ret2; goto fsck_err; - ret = -BCH_ERR_fsck_fix; + } + + if (!have_retry) + ret = -BCH_ERR_fsck_fix; goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); ret = __bch2_topology_error(c, &out); - if (ret) - silent = false; - break; - case -BCH_ERR_btree_node_read_err_incompatible: - ret = -BCH_ERR_fsck_errors_not_fixed; - silent = false; break; } - - if (!silent) - bch2_print_str(c, KERN_ERR, out.buf); +print: + bch2_print_str(c, KERN_ERR, out.buf); out: fsck_err: printbuf_exit(&out); @@ -616,8 +655,9 @@ static int __btree_err(int ret, #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ ({ \ - int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, \ BCH_FSCK_ERR_##_err_type, \ + failed, err_msg, \ msg, ##__VA_ARGS__); \ \ if (_ret != -BCH_ERR_fsck_fix) { \ @@ -625,7 +665,7 @@ static int __btree_err(int ret, goto fsck_err; \ } \ \ - *saw_error = true; \ + true; \ }) #define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) @@ -683,8 +723,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, - unsigned offset, unsigned sectors, - int write, bool have_retry, bool *saw_error) + unsigned offset, unsigned sectors, int write, + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); @@ -897,7 +938,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b, static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, - bool have_retry, bool *saw_error) + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -1010,7 +1052,9 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry, bool *saw_error) + struct btree *b, + struct bch_io_failures *failed, + struct printbuf *err_msg) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -1023,7 +1067,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); u64 max_journal_seq = 0; struct printbuf buf = PRINTBUF; - int ret = 0, retry_read = 0, write = READ; + int ret = 0, write = READ; u64 start_time = local_clock(); b->version_ondisk = U16_MAX; @@ -1157,15 +1201,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->version_ondisk = min(b->version_ondisk, le16_to_cpu(i->version)); - ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry, saw_error); + ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - 
ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); + ret = validate_bset_keys(c, b, i, READ, failed, err_msg); if (ret) goto fsck_err; @@ -1293,19 +1336,11 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (!ptr_written) set_btree_node_need_rewrite(b); -out: +fsck_err: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); - return retry_read; -fsck_err: - if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) { - retry_read = 1; - } else { - set_btree_node_read_error(b); - } - goto out; + return ret; } static void btree_node_read_work(struct work_struct *work) @@ -1317,15 +1352,25 @@ static void btree_node_read_work(struct work_struct *work) struct btree *b = rb->b; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; + int ret = 0; + struct printbuf buf = PRINTBUF; - bool saw_error = false; - bool retry = false; - bool can_retry; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree node read error at btree "); + bch2_btree_pos_to_text(&buf, c, b); + prt_newline(&buf); goto start; while (1) { - retry = true; - bch_info(c, "retrying read"); + ret = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick, -1); + if (ret) { + set_btree_node_read_error(b); + break; + } + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = ca != NULL; rb->start_time = local_clock(); @@ -1343,60 +1388,54 @@ static void btree_node_read_work(struct work_struct *work) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, rb->start_time, !bio->bi_status); start: - printbuf_reset(&buf); - bch2_btree_pos_to_text(&buf, c, b); - - if (ca && bio->bi_status) - bch_err_dev_ratelimited(ca, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) enumerated_ref_put(&ca->io_ref[READ], 
BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick, false); - - can_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), - &failed, &rb->pick, -1) > 0; - - if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { - if (retry) - bch_info(c, "retry success"); - break; + if (bio->bi_status) { + bch2_mark_io_failure(&failed, &rb->pick, false); + continue; } - saw_error = true; + ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); + if (ret == -BCH_ERR_btree_node_read_err_want_retry || + ret == -BCH_ERR_btree_node_read_err_must_retry) + continue; - if (!can_retry) { + if (ret) set_btree_node_read_error(b); - break; - } - } - if (btree_node_read_error(b)) { - struct printbuf buf = PRINTBUF; - bch2_btree_lost_data(c, &buf, b->c.btree_id); - if (buf.pos) - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); + + break; } - if ((saw_error || + bch2_io_failures_to_text(&buf, c, &failed); + + if (btree_node_read_error(b)) + bch2_btree_lost_data(c, &buf, b->c.btree_id); + + /* + * only print retry success if we read from a replica with no errors + */ + if (btree_node_read_error(b)) + prt_printf(&buf, "ret %s", bch2_err_str(ret)); + else if (failed.nr) { + if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev)) + prt_printf(&buf, "retry success"); + else + prt_printf(&buf, "repair success"); + } + + if ((failed.nr || btree_node_need_rewrite(b)) && !btree_node_read_error(b) && c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { - if (saw_error) { - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", - __func__, buf.buf); - } - + prt_printf(&buf, " (rewriting node)"); bch2_btree_node_rewrite_async(c, b); } + prt_newline(&buf); + + if (failed.nr) + bch2_print_str_ratelimited(c, KERN_ERR, 
buf.buf); async_object_list_del(c, btree_read_bio, rb->list_idx); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], @@ -1486,12 +1525,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) struct btree *b = ra->b; struct printbuf buf = PRINTBUF; bool dump_bset_maps = false; - bool have_retry = false; int ret = 0, best = -1, write = READ; unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; bool _saw_error = false, *saw_error = &_saw_error; + struct printbuf *err_msg = NULL; + struct bch_io_failures *failed = NULL; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1584,7 +1624,7 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) if (best >= 0) { memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); - ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); + ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); } else { ret = -1; } @@ -2211,8 +2251,6 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - bool saw_error; - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), (struct bkey_validate_context) { .from = BKEY_VALIDATE_btree_node, @@ -2225,8 +2263,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return ret; } - ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); + ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL); if (ret) { bch2_inconsistent_error(c); dump_stack(); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index afdb11a9f71c..30a5180532c8 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -134,7 +134,9 @@ void bch2_btree_build_aux_trees(struct btree *); void 
bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, bool, bool *); + struct btree *, + struct bch_io_failures *, + struct printbuf *); void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 079bc2b359cd..4ee5d486b305 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -42,7 +42,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; struct bio *bio; - bool failed = false, saw_error = false; + bool failed = false; struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_verify_replicas); @@ -66,7 +66,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) + if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) return false; n_sorted = c->verify_data->data; From c21f41f6905be4fc5059a10a5bba94105ba87269 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Apr 2025 17:55:20 -0400 Subject: [PATCH 095/218] bcachefs: bch2_dirent_to_text() shows casefolded dirents Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a51195088227..d198001838f3 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -212,12 +212,19 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + 
prt_printf(out, "%.*s", d_name.len, d_name.name); + + if (d.v->d_casefold) { + struct qstr d_name = bch2_dirent_get_lookup_name(d); + prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name); + } + + prt_str(out, " ->"); if (d.v->d_type != DT_SUBVOL) - prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); else - prt_printf(out, "%u -> %u", + prt_printf(out, " %u -> %u", le32_to_cpu(d.v->d_parent_subvol), le32_to_cpu(d.v->d_child_subvol)); From aff2b6a7fc285287f7ffc6691aca333a63b18230 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Apr 2025 12:38:53 -0400 Subject: [PATCH 096/218] bcachefs: provide unlocked version of run_explicit_recovery_pass_persistent Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery.c | 30 +++++++++++++++--------------- fs/bcachefs/recovery_passes.c | 20 +++++++++++++++++--- fs/bcachefs/recovery_passes.h | 2 ++ fs/bcachefs/super-io.c | 3 +++ 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 8989ea4a3934..0369dd656d32 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -844,6 +844,7 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; + u64 recovery_passes_required; unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; u64 btrees_lost_data; } sb; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d13a6df289c7..375111b56029 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -52,24 +52,24 @@ int bch2_btree_lost_data(struct bch_fs *c, } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_topology) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_topology) ?: ret; /* Btree node accounting will be off: */ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, 
ext->errors_silent); - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_lrus) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_lrus) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; #endif switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); @@ -79,30 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; goto out; case BTREE_ID_need_discard: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, 
BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_freespace: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_lru: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; goto out; case BTREE_ID_accounting: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; goto out; case BTREE_ID_snapshots: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; goto out; default: - ret = bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; goto out; } out: diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 347e17fe7901..97af1e0629eb 100644 --- a/fs/bcachefs/recovery_passes.c +++ 
b/fs/bcachefs/recovery_passes.c @@ -193,9 +193,9 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, return ret; } -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass) +int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass) { lockdep_assert_held(&c->sb_lock); @@ -205,6 +205,20 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, return bch2_run_explicit_recovery_pass_printbuf(c, out, pass); } +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass) +{ + if (c->sb.recovery_passes_required & BIT_ULL(pass)) + return 0; + + mutex_lock(&c->sb_lock); + int ret = __bch2_run_explicit_recovery_pass_persistent(c, out, pass); + mutex_unlock(&c->sb_lock); + + return ret; +} + static void bch2_clear_recovery_pass_required(struct bch_fs *c, enum bch_recovery_pass pass) { diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 1f91be4258c5..94fbc64e9b7e 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -13,6 +13,8 @@ int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass); int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, enum bch_recovery_pass); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d53cbc5f9925..8730d2e78d1d 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -623,6 +623,9 @@ static void bch2_sb_update(struct bch_fs *c) struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); if (ext) { + c->sb.recovery_passes_required = + bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + 
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); From 7677859a47a464f1c5603077809d4bc13f2d549f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Apr 2025 12:39:17 -0400 Subject: [PATCH 097/218] bcachefs: Run most explicit recovery passes persistent If we detect an error that requires running a recovery pass, and we're not in recovery, we won't be able to fix it until the next mount - make sure we're noting in the superblock that it needs to run. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 4 ++-- fs/bcachefs/error.c | 2 +- fs/bcachefs/recovery_passes.c | 2 +- fs/bcachefs/recovery_passes.h | 3 --- fs/bcachefs/sb-members.c | 2 +- fs/bcachefs/subvolume.c | 2 +- 6 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 3ec33a7e9d92..596edc7bba2f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -399,7 +399,7 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf bool print = __bch2_count_fsck_err(c, id, buf); - int ret = bch2_run_explicit_recovery_pass_printbuf(c, buf, + int ret = bch2_run_explicit_recovery_pass_persistent(c, buf, BCH_RECOVERY_PASS_check_allocations); if (insert) { @@ -972,7 +972,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - bch2_run_explicit_recovery_pass_printbuf(c, &buf, + bch2_run_explicit_recovery_pass_persistent(c, &buf, BCH_RECOVERY_PASS_check_allocations); if (print) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 731733e12e6b..d7bc70fd7762 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -104,7 +104,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) __bch2_inconsistent_error(c, out); return -BCH_ERR_btree_need_topology_repair; } else { - return 
bch2_run_explicit_recovery_pass_printbuf(c, out, BCH_RECOVERY_PASS_check_topology) ?: + return bch2_run_explicit_recovery_pass_persistent(c, out, BCH_RECOVERY_PASS_check_topology) ?: -BCH_ERR_btree_node_read_validate_error; } } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 97af1e0629eb..e14aca00cb7d 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -162,7 +162,7 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, } } -int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c, +static int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c, struct printbuf *out, enum bch_recovery_pass pass) { diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 94fbc64e9b7e..f33dd005beb4 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -8,9 +8,6 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); -int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *, - struct printbuf *, - enum bch_recovery_pass); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 462a2c21a9de..9ab4d9a4b421 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -20,7 +20,7 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); - int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf, + int ret = bch2_run_explicit_recovery_pass_persistent(c, &buf, BCH_RECOVERY_PASS_check_allocations); if (print) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index ff20ce98a476..51ab2ee10706 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -23,7 +23,7 @@ static int bch2_subvolume_missing(struct bch_fs *c, u32 
subvolid) prt_printf(&buf, "missing subvolume %u", subvolid); bool print = bch2_count_fsck_err(c, subvol_missing, &buf); - int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf, + int ret = bch2_run_explicit_recovery_pass_persistent(c, &buf, BCH_RECOVERY_PASS_check_inodes); if (print) bch2_print_str(c, KERN_ERR, buf.buf); From cf95296295bebadcf8b4a695064d2df35e0c127e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Apr 2025 11:45:56 -0400 Subject: [PATCH 098/218] bcachefs: bch2_trans_update_ip() Allow btree_insert_entry.ip_allocated to be passed in, so we get better info on where alloc updates are coming from. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/btree_update.c | 7 ++++--- fs/bcachefs/btree_update.h | 12 ++++++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c63348c4b874..002e3853f8cf 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -484,7 +484,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_trans_update(trans, &iter, &a->k_i, flags); + ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? 
ERR_PTR(ret) : a; } diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 2bffd5121c31..ce83cd037551 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -511,8 +511,9 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, return 0; } -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) { kmsan_check_memory(k, bkey_bytes(&k->k)); @@ -548,7 +549,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); } int bch2_btree_insert_clone_trans(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index e674419c299e..62d24b081e27 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -102,8 +102,16 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter * int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); -int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_iter_update_trigger_flags); +int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_iter_update_trigger_flags, + unsigned long); + +static inline int __must_check +bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +{ + return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); +} struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, 
unsigned); From a349868b5e2503271bedf5f0b6e3638552047e0f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Apr 2025 14:50:07 -0400 Subject: [PATCH 099/218] bcachefs: bch2_fs_open() now takes a darray Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.h | 1 + fs/bcachefs/fs.c | 4 ++-- fs/bcachefs/fsck.c | 4 ++-- fs/bcachefs/super.c | 24 ++++++++++++------------ fs/bcachefs/super.h | 2 +- fs/bcachefs/util.c | 4 ++-- fs/bcachefs/util.h | 4 ++-- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 88f0ca3f0af5..50ec3decfe8c 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -21,6 +21,7 @@ struct { \ typedef DARRAY(char) darray_char; typedef DARRAY(char *) darray_str; +typedef DARRAY(const char *) darray_const_str; typedef DARRAY(u8) darray_u8; typedef DARRAY(u16) darray_u16; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cdf84180829a..9916bd38a599 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2441,7 +2441,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) struct inode *vinode; struct bch2_opts_parse *opts_parse = fc->fs_private; struct bch_opts opts = opts_parse->opts; - darray_str devs; + darray_const_str devs; darray_fs devs_to_fs = {}; int ret; @@ -2465,7 +2465,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (!IS_ERR(sb)) goto got_sb; - c = bch2_fs_open(devs.data, devs.nr, opts); + c = bch2_fs_open(&devs, &opts); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index d927fdafd43a..ef2d6cbffcc2 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3059,7 +3059,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - darray_str(devs) = {}; + darray_const_str devs = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -3117,7 +3117,7 @@ long bch2_ioctl_fsck_offline(struct 
bch_ioctl_fsck_offline __user *user_arg) bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + thr->c = bch2_fs_open(&devs, &thr->opts); if (!IS_ERR(thr->c) && thr->c->opts.errors == BCH_ON_ERROR_panic) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f29965469b28..5fcd7099bc6a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -807,7 +807,7 @@ static int bch2_fs_init_rw(struct bch_fs *c) return 0; } -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch_sb_handles *sbs) { struct bch_fs *c; @@ -821,7 +821,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, goto out; } - c->stdio = (void *)(unsigned long) opts.stdio; + c->stdio = (void *)(unsigned long) opts->stdio; __module_get(THIS_MODULE); @@ -921,7 +921,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, if (ret) goto err; - bch2_opts_apply(&c->opts, opts); + bch2_opts_apply(&c->opts, *opts); c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) @@ -2273,8 +2273,8 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); } -struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts) +struct bch_fs *bch2_fs_open(darray_const_str *devices, + struct bch_opts *opts) { bch_sb_handles sbs = {}; struct bch_fs *c = NULL; @@ -2285,26 +2285,26 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); - if (!nr_devices) { + if (!devices->nr) { ret = -EINVAL; goto err; } - ret = darray_make_room(&sbs, nr_devices); + ret = darray_make_room(&sbs, devices->nr); if (ret) goto err; - for (unsigned i = 0; i < nr_devices; i++) { + darray_for_each(*devices, i) { 
struct bch_sb_handle sb = { NULL }; - ret = bch2_read_super(devices[i], &opts, &sb); + ret = bch2_read_super(*i, opts, &sb); if (ret) goto err; BUG_ON(darray_push(&sbs, sb)); } - if (opts.nochanges && !opts.read_only) { + if (opts->nochanges && !opts->read_only) { ret = -BCH_ERR_erofs_nochanges; goto err_print; } @@ -2314,7 +2314,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb, &opts); + ret = bch2_dev_in_fs(best, sb, opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { @@ -2358,7 +2358,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, return c; err_print: pr_err("bch_fs_open err opening %s: %s", - devices[0], bch2_err_str(ret)); + devices->data[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index a1566f2d77c3..be75603fefe9 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -45,7 +45,7 @@ void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); -struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 1cff407c8c9d..dc3817f545fa 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -1016,14 +1016,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) return ret; } -void bch2_darray_str_exit(darray_str *d) +void bch2_darray_str_exit(darray_const_str *d) { darray_for_each(*d, i) kfree(*i); darray_exit(d); } -int bch2_split_devs(const char *_dev_name, darray_str *ret) +int bch2_split_devs(const char *_dev_name, darray_const_str *ret) { darray_init(ret); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 7a93e187a49a..14cb2c7dfda4 100644 --- 
a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -690,8 +690,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) return l.len == r.len && !memcmp(l.name, r.name, l.len); } -void bch2_darray_str_exit(darray_str *); -int bch2_split_devs(const char *, darray_str *); +void bch2_darray_str_exit(darray_const_str *); +int bch2_split_devs(const char *, darray_const_str *); #ifdef __KERNEL__ From 98e5e36d8c58ab41c28367d3bfc9ec4e8795e421 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Apr 2025 22:00:01 -0400 Subject: [PATCH 100/218] bcachefs: bch2_dev_add() can run on a non-started fs Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 5fcd7099bc6a..e89b659514b2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1894,6 +1894,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; } unsigned dev_idx = ret; + ret = 0; /* success: */ @@ -1913,27 +1914,29 @@ int bch2_dev_add(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); - ret = bch2_dev_usage_init(ca, false); - if (ret) - goto err_late; + if (test_bit(BCH_FS_started, &c->flags)) { + ret = bch2_dev_usage_init(ca, false); + if (ret) + goto err_late; - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(ca, ret, "marking new superblock"); - if (ret) - goto err_late; + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); + bch_err_msg(ca, ret, "marking new superblock"); + if (ret) + goto err_late; - ret = bch2_fs_freespace_init(c); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err_late; + ret = bch2_fs_freespace_init(c); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) + goto err_late; - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); + if (ca->mi.state == BCH_MEMBER_STATE_rw) 
+ __bch2_dev_read_write(c, ca); - ret = bch2_dev_journal_alloc(ca, false); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err_late; + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + } up_write(&c->state_lock); out: From ae0386e111253eee0f71ae3f32635a3ba22e5a7b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 30 Apr 2025 13:22:01 -0600 Subject: [PATCH 101/218] bcachefs: Avoid -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Refactor a couple of structs that contain flexible arrays in the middle by replacing them with unions. So, with these changes, fix the following warnings: fs/bcachefs/disk_accounting.c:429:51: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] fs/bcachefs/ec_types.h:8:41: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 14 ++++++++------ fs/bcachefs/ec_types.h | 7 ++++--- fs/bcachefs/journal_io.c | 6 +++--- fs/bcachefs/journal_reclaim.c | 2 +- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e399237e124a..195dc3fcec1d 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -287,7 +287,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) { - struct bch_replicas_padded r; + union bch_replicas_padded r; return accounting_to_replicas(&r.e, p) ? 
bch2_mark_replicas(c, &r.e) : 0; @@ -361,7 +361,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && @@ -379,7 +379,7 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && @@ -438,10 +438,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { - struct { + union { + u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, + BCH_BKEY_PTRS_MAX)]; struct bch_replicas_usage r; - u8 pad[BCH_BKEY_PTRS_MAX]; } u; + u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; if (!accounting_to_replicas(&u.r.r, i->pos)) continue; @@ -640,7 +642,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, switch (acc->type) { case BCH_DISK_ACCOUNTING_replicas: { - struct bch_replicas_padded r; + union bch_replicas_padded r; __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 06144bfd9c19..809446c78951 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -4,9 +4,10 @@ #include "bcachefs_format.h" -struct bch_replicas_padded { +union bch_replicas_padded { + u8 bytes[struct_size_t(struct bch_replicas_entry_v1, + devs, BCH_BKEY_PTRS_MAX)]; struct bch_replicas_entry_v1 e; - u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -28,7 +29,7 @@ struct gc_stripe { u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - 
struct bch_replicas_padded r; + union bch_replicas_padded r; }; #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8f38e9485cd8..be86fd21de2a 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1406,7 +1406,7 @@ int bch2_journal_read(struct bch_fs *c, } genradix_for_each(&c->journal_entries, radix_iter, _i) { - struct bch_replicas_padded replicas = { + union bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, .e.nr_devs = 0, .e.nr_required = 1, @@ -1634,7 +1634,7 @@ static CLOSURE_CALLBACK(journal_write_done) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -2057,7 +2057,7 @@ CLOSURE_CALLBACK(bch2_journal_write) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); int ret; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index bb339be54e7b..ce9e0bd7ec4f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -963,7 +963,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); while (!ret) { - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); if (seq >= j->pin.back) From 5ce11d9d1bd5dfd8876d35bd9e61f38f47807c42 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Apr 2025 14:41:37 -0400 Subject: [PATCH 102/218] bcachefs: sysfs trigger_recalc_capacity For bug diagnosis Signed-off-by: Kent Overstreet --- 
fs/bcachefs/sysfs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index dfae5eda7a4c..1d0c0f24a7b9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -146,8 +146,9 @@ write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); -write_attribute(trigger_freelist_wakeup); write_attribute(trigger_btree_updates); +write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_recalc_capacity); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -428,6 +429,12 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_freelist_wakeup) closure_wake_up(&c->freelist_wait); + if (attr == &sysfs_trigger_recalc_capacity) { + down_read(&c->state_lock); + bch2_recalc_capacity(c); + up_read(&c->state_lock); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -553,8 +560,9 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, - &sysfs_trigger_freelist_wakeup, &sysfs_trigger_btree_updates, + &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_recalc_capacity, &sysfs_gc_gens_pos, From 8a6b883e78bfed6909e21c2afb6138b603d1ee6c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Apr 2025 22:05:49 -0400 Subject: [PATCH 103/218] bcachefs: Fix setting ca->name in device add Device add doesn't get the device index and attach to the filesystem until after attaching the block device, and setting the device name from the block device name - these need some minor tweaks.
Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e89b659514b2..9381644cabee 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1488,7 +1488,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, { ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + if (!ca->name[0]) + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); @@ -1540,6 +1542,11 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (ret) return ret; + struct printbuf name = PRINTBUF; + prt_bdevname(&name, sb->bdev); + strscpy(ca->name, name.buf, sizeof(ca->name)); + printbuf_exit(&name); + /* Commit: */ ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); @@ -1581,11 +1588,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) bch2_dev_sysfs_online(c, ca); - struct printbuf name = PRINTBUF; - prt_bdevname(&name, ca->disk_sb.bdev); - strscpy(ca->name, name.buf, sizeof(ca->name)); - printbuf_exit(&name); - bch2_rebalance_wakeup(c); return 0; } From c53e5c0c191ec6cc85c630249cbd68a2adb3f715 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Apr 2025 23:10:44 -0400 Subject: [PATCH 104/218] docs: bcachefs: add casefolding reference Signed-off-by: Kent Overstreet --- .../filesystems/bcachefs/casefolding.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst index ba5de97d155f..871a38f557e8 100644 --- a/Documentation/filesystems/bcachefs/casefolding.rst +++ b/Documentation/filesystems/bcachefs/casefolding.rst @@ -88,3 +88,21 @@ This would fail if negative dentry's were cached. 
This is slightly suboptimal, but could be fixed in future with some vfs work. + +References +---------- + +(from Peter Anvin, on the list) + +It is worth noting that Microsoft has basically declared their +"recommended" case folding (upcase) table to be permanently frozen (for +new filesystem instances in the case where they use an on-disk +translation table created at format time.) As far as I know they have +never supported anything other than 1:1 conversion of BMP code points, +nor normalization. + +The exFAT specification enumerates the full recommended upcase table, +although in a somewhat annoying format (basically a hex dump of +compressed data): + +https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification From a42f709f9ac1dc7b4ff32ab428acf7abaf3358b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 30 Apr 2025 22:26:00 -0400 Subject: [PATCH 105/218] bcachefs: Improve bch2_disk_groups_to_text() Print out the actual name of each path/label, instead of just the integer indexes. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 111 ++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 58 deletions(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 4e2f237338c2..c1a2a957c884 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -86,35 +86,6 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field * return ret; } -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - out->atomic++; - rcu_read_lock(); - - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - if (!g) - goto out; - - for (unsigned i = 0; i < g->nr; i++) { - if (i) - prt_printf(out, " "); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - continue; - } - - prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); - prt_printf(out, "]"); - } - -out: - rcu_read_unlock(); - out->atomic--; -} - static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -241,17 +212,14 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { - struct bch_disk_groups_cpu *g; - const struct bch_devs_mask *m; - bool ret; - rcu_read_lock(); - g = rcu_dereference(c->disk_groups); - m = g && t.group < g->nr && !g->entries[t.group].deleted + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + const struct bch_devs_mask *m = + g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - ret = m ? test_bit(dev, m->d) : false; + bool ret = m ? 
test_bit(dev, m->d) : false; rcu_read_unlock(); return ret; @@ -377,54 +345,81 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, + unsigned v) { - struct bch_disk_groups_cpu *groups; - struct bch_disk_group_cpu *g; - unsigned nr = 0; u16 path[32]; - - out->atomic++; - rcu_read_lock(); - groups = rcu_dereference(c->disk_groups); - if (!groups) - goto invalid; + unsigned nr = 0; while (1) { if (nr == ARRAY_SIZE(path)) goto invalid; - if (v >= groups->nr) + if (v >= (g ? g->nr : 0)) goto invalid; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + v; - if (g->deleted) + if (e->deleted) goto invalid; path[nr++] = v; - if (!g->parent) + if (!e->parent) break; - v = g->parent - 1; + v = e->parent - 1; } while (nr) { - v = path[--nr]; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + path[--nr]; - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); if (nr) prt_printf(out, "."); } -out: - rcu_read_unlock(); - out->atomic--; return; invalid: prt_printf(out, "invalid label %u", v); - goto out; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_make_room(out, 4096); + + out->atomic++; + rcu_read_lock(); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + for (unsigned i = 0; i < (g ? 
g->nr : 0); i++) { + prt_printf(out, "%2u: ", i); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + goto next; + } + + __bch2_disk_path_to_text(out, g, i); + + prt_printf(out, " devs"); + + for_each_member_device_rcu(c, ca, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); +next: + prt_newline(out); + } + + rcu_read_unlock(); + out->atomic--; +} + +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + out->atomic++; + rcu_read_lock(); + __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), + rcu_read_unlock(); + --out->atomic; } void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) From 9180c5f91804e9b381d11b6c19cbefa5c5490b2b Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 2 May 2025 04:01:32 +0800 Subject: [PATCH 106/218] bcachefs: Rename x_name to x_name_and_value The flexible array contains name and value, the x_name is misleading. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 16 ++++++++-------- fs/bcachefs/xattr.h | 4 ++-- fs/bcachefs/xattr_format.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 423ace6272be..6b64cec78fb9 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); + &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); } static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) @@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) return l.v->x_type != r->type || l.v->x_name_len != r->name.len || - memcmp(l.v->x_name, r->name.name, r->name.len); + memcmp(l.v->x_name_and_value, r->name.name, r->name.len); } static bool xattr_cmp_bkey(struct bkey_s_c _l, struct 
bkey_s_c _r) @@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l.v->x_type != r.v->x_type || l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); + memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); } const struct bch_hash_desc bch2_xattr_hash_desc = { @@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: @@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, unsigned name_len = xattr.v->x_name_len; unsigned val_len = le16_to_cpu(xattr.v->x_val_len); unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name); + offsetof(struct bch_xattr, x_name_and_value); val_len = min_t(int, val_len, max_name_val_bytes - name_len); name_len = min(name_len, max_name_val_bytes); prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name, + name_len, xattr.v->x_name_and_value, val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || @@ -202,7 +202,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, xattr->v.x_type = type; xattr->v.x_name_len = namelen; xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr->v.x_name_and_value, name, namelen); memcpy(xattr_val(&xattr->v), value, size); ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, @@ -270,7 +270,7 @@ static int bch2_xattr_emit(struct dentry *dentry, if (!prefix) return 0; - return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); + return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); } static int 
bch2_xattr_list_bcachefs(struct bch_fs *c, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 132fbbd15a66..1139bf345f70 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + name_len + val_len, sizeof(u64)); } #define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) struct xattr_search_key { u8 type; diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index 67426e33d04e..4121b78d9a92 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -16,10 +16,10 @@ struct bch_xattr { /* * x_name contains the name and value counted by * x_name_len + x_val_len. The introduction of - * __counted_by(x_name_len) caused a false positive + * __counted_by(x_name_len) previously caused a false positive * detection of an out of bounds write. */ - __u8 x_name[]; + __u8 x_name_and_value[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ From e3006cb010150cefde5739d2b8c2e8f7b876eb84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 16 Apr 2025 09:28:10 -0400 Subject: [PATCH 107/218] bcachefs: Don't emit bch_sb_field_members_v1 if not required In 'bcachefs_metadata_extent_flags', we stopped requiring members_v1 to be present - only that either v1 or v2 is present.
Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-members.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 9ab4d9a4b421..f6a0b3de6bca 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -139,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; + if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { + bch2_sb_field_resize(disk_sb, members_v1, 0); + return 0; + } + mi1 = bch2_sb_field_resize(disk_sb, members_v1, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); From 15dbd0d8146356a43003784ce38aabae32e96197 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 May 2025 14:47:39 -0400 Subject: [PATCH 108/218] bcachefs: snapshot delete progress indicator Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/snapshot.c | 142 ++++++++++++++++++++++------------- fs/bcachefs/snapshot.h | 7 +- fs/bcachefs/snapshot_types.h | 25 ++++++ fs/bcachefs/subvolume.c | 2 - fs/bcachefs/subvolume.h | 3 - fs/bcachefs/super.c | 1 + fs/bcachefs/sysfs.c | 5 ++ 8 files changed, 127 insertions(+), 61 deletions(-) create mode 100644 fs/bcachefs/snapshot_types.h diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0369dd656d32..cd35d1cf3fbb 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -216,6 +216,7 @@ #include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "snapshot_types.h" #include "time_stats.h" #include "util.h" @@ -869,7 +870,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; - struct work_struct snapshot_delete_work; + struct snapshot_delete snapshot_delete; struct work_struct snapshot_wait_for_pagecache_and_delete_work; snapshot_id_list snapshots_unlinked; struct mutex 
snapshots_unlinked_lock; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 94cf60f76b64..aaf64271c041 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_key_cache.h" @@ -1346,12 +1347,6 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { darray_for_each(*l, i) @@ -1385,28 +1380,28 @@ static unsigned __live_child(struct snapshot_table *t, u32 id, return 0; } -static unsigned live_child(struct bch_fs *c, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static unsigned live_child(struct bch_fs *c, u32 id) { + struct snapshot_delete *d = &c->snapshot_delete; + rcu_read_lock(); u32 ret = __live_child(rcu_dereference(c->snapshots), id, - delete_leaves, delete_interior); + &d->delete_leaves, &d->delete_interior); rcu_read_unlock(); return ret; } static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) + struct bkey_s_c k) { - if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + struct snapshot_delete *d = &trans->c->snapshot_delete; + + if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = 
PTR_ERR_OR_ZERO(new); @@ -1442,44 +1437,46 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, * it doesn't have child snapshot nodes - it's now redundant and we can mark it * as deleted. */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) { if (k.k->type != KEY_TYPE_snapshot) return 0; struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); unsigned live_children = 0; + int ret = 0; if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; + mutex_lock(&d->progress_lock); for (unsigned i = 0; i < 2; i++) { u32 child = le32_to_cpu(s.v->children[i]); live_children += child && - !snapshot_list_has_id(delete_leaves, child); + !snapshot_list_has_id(&d->delete_leaves, child); } if (live_children == 0) { - return snapshot_list_add(c, delete_leaves, s.k->p.offset); + ret = snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); } else if (live_children == 1) { - struct snapshot_interior_delete d = { + struct snapshot_interior_delete n = { .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + .live_child = live_child(c, s.k->p.offset), }; - if (!d.live_child) { - bch_err(c, "error finding live child of snapshot %u", d.id); - return -EINVAL; + if (!n.live_child) { + bch_err(c, "error finding live child of snapshot %u", n.id); + ret = -EINVAL; + } else { + ret = darray_push(&d->delete_interior, n); } - - return darray_push(delete_interior, d); - } else { - return 0; } + mutex_unlock(&d->progress_lock); + + return ret; } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, @@ -1555,39 +1552,48 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return bch2_trans_update(trans, iter, &s->k_i, 0); } 
+static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) +{ + prt_printf(out, "deleting leaves"); + darray_for_each(d->delete_leaves, i) + prt_printf(out, " %u", *i); + prt_newline(out); + + prt_printf(out, "interior"); + darray_for_each(d->delete_interior, i) + prt_printf(out, " %u->%u", i->id, i->live_child); + prt_newline(out); +} + int bch2_delete_dead_snapshots(struct bch_fs *c) { if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; struct btree_trans *trans = bch2_trans_get(c); - snapshot_id_list delete_leaves = {}; - interior_delete_list delete_interior = {}; + struct snapshot_delete *d = &c->snapshot_delete; int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ + d->running = true; + d->pos = BBPOS_MIN; + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); + check_should_delete_snapshot(trans, k)); if (!bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; - if (!delete_leaves.nr && !delete_interior.nr) + if (!d->delete_leaves.nr && !d->delete_interior.nr) goto err; { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "deleting leaves"); - darray_for_each(delete_leaves, i) - prt_printf(&buf, " %u", *i); - - prt_printf(&buf, " interior"); - darray_for_each(delete_interior, i) - prt_printf(&buf, " %u->%u", i->id, i->live_child); + bch2_snapshot_delete_nodes_to_text(&buf, d); ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); printbuf_exit(&buf); @@ -1595,19 +1601,21 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { struct disk_reservation res = { 0 }; - if (!btree_type_has_snapshots(btree)) + d->pos.pos = POS_MIN; + + if 
(!btree_type_has_snapshots(d->pos.btree)) continue; ret = for_each_btree_key_commit(trans, iter, - btree, POS_MIN, + d->pos.btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, - &delete_leaves, - &delete_interior)); + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + delete_dead_snapshots_process_key(trans, &iter, k); + })); bch2_disk_reservation_put(c, &res); @@ -1617,7 +1625,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - darray_for_each(delete_leaves, i) { + darray_for_each(d->delete_leaves, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); if (!bch2_err_matches(ret, EROFS)) @@ -1634,11 +1642,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); if (ret) goto err; - darray_for_each(delete_interior, i) { + darray_for_each(d->delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, i->id)); if (!bch2_err_matches(ret, EROFS)) @@ -1647,8 +1655,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } err: - darray_exit(&delete_interior); - darray_exit(&delete_leaves); + mutex_lock(&d->progress_lock); + darray_exit(&d->delete_interior); + darray_exit(&d->delete_leaves); + d->running = false; + mutex_unlock(&d->progress_lock); bch2_trans_put(trans); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); @@ -1657,7 +1668,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) void bch2_delete_dead_snapshots_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct bch_fs *c = container_of(work, struct 
bch_fs, snapshot_delete.work); set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); @@ -1672,10 +1683,26 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } +void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct snapshot_delete *d = &c->snapshot_delete; + + if (!d->running) { + prt_str(out, "(not running)"); + return; + } + + mutex_lock(&d->progress_lock); + bch2_snapshot_delete_nodes_to_text(out, d); + + bch2_bbpos_to_text(out, d->pos); + mutex_unlock(&d->progress_lock); +} + int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, enum btree_id id, struct bpos pos) @@ -1750,3 +1777,10 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) { kvfree(rcu_dereference_protected(c->snapshots, true)); } + +void bch2_fs_snapshots_init_early(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); + mutex_init(&c->snapshot_delete.progress_lock); + mutex_init(&c->snapshots_unlinked_lock); +} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 81180181d7c9..24a451bb7024 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -244,7 +244,6 @@ int bch2_reconstruct_snapshots(struct bch_fs *); int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -void bch2_delete_dead_snapshots_work(struct work_struct *); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); @@ -259,7 +258,13 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int bch2_delete_dead_snapshots(struct bch_fs *); +void 
bch2_delete_dead_snapshots_work(struct work_struct *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); +void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); +void bch2_fs_snapshots_init_early(struct bch_fs *); #endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h new file mode 100644 index 000000000000..62def3d7e0ed --- /dev/null +++ b/fs/bcachefs/snapshot_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_TYPES_H +#define _BCACHEFS_SNAPSHOT_TYPES_H + +#include "bbpos_types.h" +#include "subvolume_types.h" + +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +struct snapshot_delete { + struct work_struct work; + + struct mutex progress_lock; + snapshot_id_list delete_leaves; + interior_delete_list delete_interior; + + bool running; + struct bbpos pos; +}; + +#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 51ab2ee10706..3c6ba1469de2 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -730,8 +730,6 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) void bch2_fs_subvolumes_init_early(struct bch_fs *c) { - INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, bch2_subvolume_wait_for_pagecache_and_delete); - mutex_init(&c->snapshots_unlinked_lock); } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index ee5e4e5a0fc8..075f55e25c70 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -77,9 +77,6 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btr _end, _subvolid, _flags, _k, _do); \ }) -int bch2_delete_dead_snapshots(struct bch_fs *); -void 
bch2_delete_dead_snapshots_async(struct bch_fs *); - int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9381644cabee..45e2b2bc8c65 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -868,6 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_fs_quota_init(c); bch2_fs_rebalance_init(c); bch2_fs_sb_errors_init_early(c); + bch2_fs_snapshots_init_early(c); bch2_fs_subvolumes_init_early(c); INIT_LIST_HEAD(&c->list); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1d0c0f24a7b9..adf99a805a62 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -198,6 +198,7 @@ read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); +read_attribute(snapshot_delete_status); read_attribute(new_stripes); @@ -320,6 +321,9 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); + if (attr == &sysfs_snapshot_delete_status) + bch2_snapshot_delete_status_to_text(out, c); + /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -466,6 +470,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_write_stats, &sysfs_rebalance_status, + &sysfs_snapshot_delete_status, &sysfs_compression_stats, From 7b8c41c178742c680e6acb610a760f9e007cfeac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 May 2025 12:23:59 -0400 Subject: [PATCH 109/218] bcachefs: Add comments for inode snapshot requirements Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 6 ++++++ fs/bcachefs/xattr.c | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index add141ac45b5..399df8fede8b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -279,6 +279,12 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, 
inode_update_flags = 0; } + /* + * extents, dirents and xattrs updates require that an inode update also + * happens - to ensure that if a key exists in one of those btrees with + * a given snapshot ID an inode is also present - so we may have to skip + * the nojournal optimization: + */ if (inode->k.p.snapshot != iter.snapshot) { inode->k.p.snapshot = iter.snapshot; inode_update_flags = 0; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 6b64cec78fb9..627f153798c6 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, if (ret) return ret; + /* + * Besides the ctime update, extents, dirents and xattrs updates require + * that an inode update also happens - to ensure that if a key exists in + * one of those btrees with a given snapshot ID an inode is also present + */ inode_u->bi_ctime = bch2_current_time(c); ret = bch2_inode_write(trans, &inode_iter, inode_u); From 6f2bbd57474b3f8448c30cb1b986fbf127bb103e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 May 2025 15:18:40 -0400 Subject: [PATCH 110/218] bcachefs: kill inode_walker_entry.snapshot redundant Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ef2d6cbffcc2..256e3907cd04 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -787,12 +787,11 @@ static int ref_visible2(struct bch_fs *c, #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) struct inode_walker_entry { struct bch_inode_unpacked inode; - u32 snapshot; u64 count; u64 
i_size; }; @@ -826,7 +825,6 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, return bch2_inode_unpack(inode, &u) ?: darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = inode.k->p.snapshot, })); } @@ -872,19 +870,19 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_ struct inode_walker_entry *i; __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)) goto found; return NULL; found: - BUG_ON(k.k->p.snapshot > i->snapshot); + BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot); - if (k.k->p.snapshot != i->snapshot && !is_whiteout) { + if (k.k->p.snapshot != i->inode.bi_snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = k.k->p.snapshot; - new.count = 0; - new.i_size = 0; + new.inode.bi_snapshot = k.k->p.snapshot; + new.count = 0; + new.i_size = 0; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -892,10 +890,10 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); + w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, buf.buf); printbuf_exit(&buf); - while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) + while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) --i; size_t pos = i - w->inodes.data; @@ -1498,21 +1496,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->inode.bi_sectors == i->count) continue; - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); if (w->recalculate_sums) 
i->count = count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), trans, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); @@ -1823,20 +1821,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n%s", - i->inode.bi_inum, i->snapshot, i->inode.bi_size, + i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct btree_iter iter2; bch2_trans_copy_iter(trans, &iter2, iter); - bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot); + bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_btree_delete_at(trans, &iter2, BTREE_UPDATE_internal_snapshot_node); @@ -1858,8 +1856,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= 
inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; i->count += k.k->size; @@ -1941,13 +1939,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (i->inode.bi_nlink == i->count) continue; - count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); if (count2 < 0) return count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; @@ -1956,7 +1954,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (fsck_err_on(i->inode.bi_nlink != i->count, trans, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); if (ret) From 3c97ebea61e3ebdbab5b2564296a86d601b632e6 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sun, 4 May 2025 04:03:42 +0800 Subject: [PATCH 111/218] bcachefs: Fix inconsistent req->ec There is req->ec = erasure_code above. 
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e87b95f609c5..2d7f32f9499e 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1255,6 +1255,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, if (unlikely(ret)) return ret; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) + erasure_code = false; + req->nr_replicas = nr_replicas; req->target = target; req->ec = erasure_code; @@ -1262,9 +1265,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, req->flags = flags; req->devs_have = devs_have; - if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) - erasure_code = false; - BUG_ON(!nr_replicas || !nr_replicas_required); retry: req->ptrs.nr = 0; From 00757984d55e769a108fee6cfabee1b289c9516f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 18:16:49 -0400 Subject: [PATCH 112/218] bcachefs: Improve bch2_request_incompat_feature() message Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8730d2e78d1d..6687b9235d3c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -87,7 +87,8 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v struct printbuf buf = PRINTBUF; prt_str(&buf, "requested incompat feature "); bch2_version_to_text(&buf, version); - prt_str(&buf, " currently not enabled"); + prt_str(&buf, " currently not enabled, allowed up to "); + bch2_version_to_text(&buf, version); prt_printf(&buf, "\n set version_upgrade=incompat to enable"); bch_notice(c, "%s", buf.buf); From a9421140fc5a5647191704c8a5e93bf2aaeb2c0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 18:17:26 -0400 Subject: [PATCH 113/218] bcachefs: 
bch2_inode_unpack() cleanup bi_snapshot is now handled like other fields Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 490b85841de9..96d4ab0148bf 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -241,6 +241,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, u64 v[2]; unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -285,13 +286,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, { memset(unpacked, 0, sizeof(*unpacked)); - unpacked->bi_snapshot = k.k->p.snapshot; - switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= 0; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); @@ -310,6 +310,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -327,8 +328,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - unpacked->bi_snapshot = k.k->p.snapshot; - return likely(k.k->type == KEY_TYPE_inode_v3) ? 
bch2_inode_unpack_v3(k, unpacked) : bch2_inode_unpack_slowpath(k, unpacked); From 855070dc0b349eb4f17cfbe7e73829b1b2851bdc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 16:48:00 -0400 Subject: [PATCH 114/218] bcachefs: get_inodes_all_snapshots() now includes whiteouts The next patch is going to change lookup_inode_for_snapshot to rigorously require that extent/dirent/xattr keys have a corresponding inode key present - whiteouts included, so this simplifies the checks lookup_inode_for_snapshot() will have to do. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 115 +++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 52 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 256e3907cd04..1daa3d970919 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -792,6 +792,7 @@ static int ref_visible2(struct bch_fs *c, struct inode_walker_entry { struct bch_inode_unpacked inode; + bool whiteout; u64 count; u64 i_size; }; @@ -820,12 +821,20 @@ static struct inode_walker inode_walker_init(void) static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { - struct bch_inode_unpacked u; - - return bch2_inode_unpack(inode, &u) ?: - darray_push(&w->inodes, ((struct inode_walker_entry) { - .inode = u, + int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { + .whiteout = !bkey_is_inode(inode.k), })); + if (ret) + return ret; + + struct inode_walker_entry *n = &darray_last(w->inodes); + if (!n->whiteout) { + return bch2_inode_unpack(inode, &n->inode); + } else { + n->inode.bi_inum = inode.k->p.inode; + n->inode.bi_snapshot = inode.k->p.snapshot; + return 0; + } } static int get_inodes_all_snapshots(struct btree_trans *trans, @@ -845,13 +854,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum) + for_each_btree_key_max_norestart(trans, iter, + BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + ret = add_inode(c, w, k); + if (ret) break; - - if (bkey_is_inode(k.k)) - add_inode(c, w, k); } bch2_trans_iter_exit(trans, &iter); @@ -863,6 +871,41 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return 0; } +static int get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + w->inodes.nr = 0; + w->deletes.nr = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + + if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) + continue; + + if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) + continue; + + ret = bkey_is_inode(k.k) + ? 
add_inode(c, w, k) + : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + static struct inode_walker_entry * lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { @@ -922,41 +965,6 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, return lookup_inode_for_snapshot(trans->c, w, k); } -static int get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - w->inodes.nr = 0; - w->deletes.nr = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - - if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) - continue; - - if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) - continue; - - ret = bkey_is_inode(k.k) - ? 
add_inode(c, w, k) - : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - /* * Prefer to delete the first one, since that will be the one at the wrong * offset: @@ -1450,7 +1458,9 @@ static int check_key_has_inode(struct btree_trans *trans, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + bool have_inode = i && !i->whiteout; + + if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) @@ -1461,14 +1471,14 @@ static int check_key_has_inode(struct btree_trans *trans, goto err; } - if (fsck_err_on(!i, + if (fsck_err_on(!have_inode, trans, key_in_missing_inode, "key in missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; - if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), trans, key_in_wrong_inode_type, "key for wrong inode mode %o:\n%s", i->inode.bi_mode, @@ -1856,7 +1866,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->inode.bi_snapshot > k.k->p.snapshot || + if (i->whiteout || + i->inode.bi_snapshot > k.k->p.snapshot || !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; @@ -2167,7 +2178,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (!i) + if (!i || i->whiteout) goto out; if (dir->first_this_inode) @@ -2342,7 +2353,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (!i) + if 
(!i || i->whiteout) return 0; if (inode->first_this_inode) From 0afdf4969e0ac24f63b499c6c75731564a072eb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 May 2025 15:14:04 -0400 Subject: [PATCH 115/218] bcachefs: BCH_FSCK_ERR_snapshot_key_missing_inode_snapshot We're going to be doing some snapshot deletion performance improvements, and those will strictly require that if an extent/dirent/xattr is present, an inode is present in that snapshot ID. We already check for this, but we don't repair it on disk: this patch adds that repair and turns it into a real fsck_err(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 65 +++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1daa3d970919..942a03d06074 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -907,9 +907,9 @@ static int get_visible_inodes(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) +lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) { - bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + struct bch_fs *c = trans->c; struct inode_walker_entry *i; __darray_for_each(w->inodes, i) @@ -920,34 +920,63 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_ found: BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot); - if (k.k->p.snapshot != i->inode.bi_snapshot && !is_whiteout) { - struct inode_walker_entry new = *i; + struct printbuf buf = PRINTBUF; + int ret = 0; - new.inode.bi_snapshot = k.k->p.snapshot; - new.count = 0; - new.i_size = 0; - - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, + trans, snapshot_key_missing_inode_snapshot, + "have key for 
inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, buf.buf); - printbuf_exit(&buf); + w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct bch_inode_unpacked new = i->inode; + struct bkey_i whiteout; + + new.bi_snapshot = k.k->p.snapshot; + + if (!i->whiteout) { + ret = __bch2_fsck_write_inode(trans, &new); + } else { + bkey_init(&whiteout.k); + whiteout.k.type = KEY_TYPE_whiteout; + whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot); + ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &whiteout, + BTREE_UPDATE_internal_snapshot_node); + } + + if (ret) + goto fsck_err; + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto fsck_err; + + struct inode_walker_entry new_entry = *i; + + new_entry.inode.bi_snapshot = k.k->p.snapshot; + new_entry.count = 0; + new_entry.i_size = 0; while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) --i; size_t pos = i - w->inodes.data; - int ret = darray_insert_item(&w->inodes, pos, new); + ret = darray_insert_item(&w->inodes, pos, new_entry); if (ret) - return ERR_PTR(ret); + goto fsck_err; - i = w->inodes.data + pos; + ret = -BCH_ERR_transaction_restart_nested; + goto fsck_err; } + printbuf_exit(&buf); return i; +fsck_err: + printbuf_exit(&buf); + return ERR_PTR(ret); } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, @@ -962,7 +991,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, k); + return lookup_inode_for_snapshot(trans, w, k); } /* From 3f8e97726557f2130c2992bf214c9e936b4a0877 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 May 2025 14:43:45 -0400 Subject: [PATCH 116/218] bcachefs: Skip unrelated snapshot trees in 
snapshot deletion Don't scan keys in inodes for which the snapshot tree doesn't match any we're deleting from. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 45 ++++++++++++++++++++++++++++++++++-- fs/bcachefs/snapshot_types.h | 1 + 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index aaf64271c041..f133704d50a2 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1432,6 +1432,33 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, return 0; } +static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + u64 inum = iter->btree_id != BTREE_ID_inodes + ? iter->pos.inode + : iter->pos.offset; + + if (*prev_inum == inum) + return false; + + *prev_inum = inum; + + bool ret = !snapshot_list_has_id(&d->deleting_from_trees, + bch2_snapshot_tree(c, iter->pos.snapshot)); + if (unlikely(ret)) { + struct bpos pos = iter->pos; + pos.snapshot = 0; + if (iter->btree_id != BTREE_ID_inodes) + pos.offset = U64_MAX; + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); + } + + return ret; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it @@ -1459,8 +1486,11 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s !snapshot_list_has_id(&d->delete_leaves, child); } + u32 tree = bch2_snapshot_tree(c, s.k->p.offset); + if (live_children == 0) { - ret = snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); } else if (live_children == 1) { struct snapshot_interior_delete n = { .id = s.k->p.offset, @@ -1471,7 +1501,8 @@ static int 
check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s bch_err(c, "error finding live child of snapshot %u", n.id); ret = -EINVAL; } else { - ret = darray_push(&d->delete_interior, n); + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + darray_push(&d->delete_interior, n); } } mutex_unlock(&d->progress_lock); @@ -1554,6 +1585,10 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) { + prt_printf(out, "deleting from trees"); + darray_for_each(d->deleting_from_trees, i) + prt_printf(out, " %u", *i); + prt_printf(out, "deleting leaves"); darray_for_each(d->delete_leaves, i) prt_printf(out, " %u", *i); @@ -1603,6 +1638,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { struct disk_reservation res = { 0 }; + u64 prev_inum = 0; d->pos.pos = POS_MIN; @@ -1614,6 +1650,10 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + delete_dead_snapshots_process_key(trans, &iter, k); })); @@ -1656,6 +1696,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } err: mutex_lock(&d->progress_lock); + darray_exit(&d->deleting_from_trees); darray_exit(&d->delete_interior); darray_exit(&d->delete_leaves); d->running = false; diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h index 62def3d7e0ed..6a969996e68f 100644 --- a/fs/bcachefs/snapshot_types.h +++ b/fs/bcachefs/snapshot_types.h @@ -15,6 +15,7 @@ struct snapshot_delete { struct work_struct work; struct mutex progress_lock; + snapshot_id_list deleting_from_trees; snapshot_id_list delete_leaves; interior_delete_list delete_interior; From 08d14d90a42a96f409ef3bb1fd073ca0a6bace27 Mon Sep 17 00:00:00 2001 
From: Kent Overstreet Date: Fri, 2 May 2025 12:33:17 -0400 Subject: [PATCH 117/218] bcachefs: BCH_SNAPSHOT_DELETED -> BCH_SNAPSHOT_WILL_DELETE We're going to be speeding up snapshot deletion, by only having it process the extents/dirents/xattrs btrees if an inode of a given snapshot ID was present. This raises the possibility of 'bkey_in_missing_snapshot' errors popping up, if we ever accidentally don't do the corresponding inode update, or if the new algorithm has bugs. So we'll want to be able to differentiate more definitively between 'snapshot went missing' (and perhaps needs to be reconstructed), and 'key in snapshot that was deleted'. So instead of deleting snapshot IDs, we'll be adding a new deleted flag and leaving them permanently. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 12 ++++++------ fs/bcachefs/snapshot_format.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index f133704d50a2..87c8aead8610 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -213,7 +213,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_DELETED(s.v), + BCH_SNAPSHOT_WILL_DELETE(s.v), le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), @@ -339,7 +339,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, parent - id - 1 < IS_ANCESTOR_BITMAP) __set_bit(parent - id - 1, t->is_ancestor); - if (BCH_SNAPSHOT_DELETED(s.v)) { + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); @@ -748,7 +748,7 @@ static int check_snapshot(struct btree_trans *trans, } bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_DELETED(&s); + 
!BCH_SNAPSHOT_WILL_DELETE(&s); if (should_have_subvol) { id = le32_to_cpu(s.subvol); @@ -1062,10 +1062,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) } /* already deleted? */ - if (BCH_SNAPSHOT_DELETED(&s->v)) + if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) goto err; - SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; err: @@ -1782,7 +1782,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct return 0; struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || + if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || interior_snapshot_needs_delete(snap)) set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h index aabcd3a74cd9..685a9fe209ab 100644 --- a/fs/bcachefs/snapshot_format.h +++ b/fs/bcachefs/snapshot_format.h @@ -15,7 +15,7 @@ struct bch_snapshot { bch_le128 btime; }; -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) +LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) From e9756dd29f33ede1a595d9fb5e0e2586f7542c1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 May 2025 12:37:36 -0400 Subject: [PATCH 118/218] bcachefs: bcachefs_metadata_version_snapshot_deletion_v2 We're going to be speeding up snapshot deletion, by only having it process the extents/dirents/xattrs btrees if an inode of a given snapshot ID was present. This raises the possibility of 'bkey_in_missing_snapshot' errors popping up, if we ever accidentally don't do the corresponding inode update, or if the new algorithm has bugs. 
So instead of deleting snapshot IDs, add a new deleted flag, so that 'key in missing snapshot' errors can more definitively tell what happened and automatically repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/snapshot.c | 91 ++++++++++++++++++++++++++--------- fs/bcachefs/snapshot.h | 31 +++++++++--- fs/bcachefs/snapshot_format.h | 2 +- fs/bcachefs/snapshot_types.h | 30 ++++++++++++ fs/bcachefs/subvolume_types.h | 27 ----------- 6 files changed, 125 insertions(+), 59 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 7ce475c565b5..0beff6af7ecf 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -695,7 +695,8 @@ struct bch_sb_field_ext { x(stripe_backpointers, BCH_VERSION(1, 22)) \ x(stripe_lru, BCH_VERSION(1, 23)) \ x(casefolding, BCH_VERSION(1, 24)) \ - x(extent_flags, BCH_VERSION(1, 25)) + x(extent_flags, BCH_VERSION(1, 25)) \ + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 87c8aead8610..a16fa0d8a274 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -211,9 +211,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", - BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_WILL_DELETE(s.v), + if (BCH_SNAPSHOT_SUBVOL(s.v)) + prt_str(out, "subvol "); + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) + prt_str(out, "will_delete "); + if (BCH_SNAPSHOT_DELETED(s.v)) + prt_str(out, "deleted "); + + prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), @@ -314,7 +319,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) 
{ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - t->live = true; + t->state = !BCH_SNAPSHOT_DELETED(s.v) + ? SNAPSHOT_ID_live + : SNAPSHOT_ID_deleted; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -711,6 +718,9 @@ static int check_snapshot(struct btree_trans *trans, memset(&s, 0, sizeof(s)); memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); + if (BCH_SNAPSHOT_DELETED(&s)) + return 0; + id = le32_to_cpu(s.parent); if (id) { ret = bch2_snapshot_lookup(trans, id, &v); @@ -998,7 +1008,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_exists(c, *id), + if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1023,22 +1033,38 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) return ret; } -int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +int __bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; + enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); - if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), + /* Snapshot was definitively deleted, this error is marked autofix */ + if (fsck_err_on(state == SNAPSHOT_ID_deleted, + trans, bkey_in_deleted_snapshot, + "key in deleted snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; + + /* + * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up 
here we have no + * idea what happened: + */ + if (fsck_err_on(state == SNAPSHOT_ID_empty, trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), prt_char(&buf, ' '), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: printbuf_exit(&buf); return ret; @@ -1085,24 +1111,25 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) struct btree_iter iter, p_iter = {}; struct btree_iter c_iter = {}; struct btree_iter tree_iter = {}; - struct bkey_s_c_snapshot s; u32 parent_id, child_id; unsigned i; int ret = 0; - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = bkey_err(s); + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_intent, snapshot); + ret = PTR_ERR_OR_ZERO(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); if (ret) goto err; - BUG_ON(s.v->children[1]); + BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); + BUG_ON(s->v.children[1]); - parent_id = le32_to_cpu(s.v->parent); - child_id = le32_to_cpu(s.v->children[0]); + parent_id = le32_to_cpu(s->v.parent); + child_id = le32_to_cpu(s->v.children[0]); if (parent_id) { struct bkey_i_snapshot *parent; @@ -1160,24 +1187,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) */ struct bkey_i_snapshot_tree *s_t; - BUG_ON(s.v->children[1]); + BUG_ON(s->v.children[1]); s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(s_t); if (ret) goto err; - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; + if (s->v.children[0]) { + s_t->v.root_snapshot = s->v.children[0]; } 
else { s_t->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&s_t->k, 0); } } - ret = bch2_btree_delete_at(trans, &iter, 0); + if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + s->v.parent = 0; + s->v.children[0] = 0; + s->v.children[1] = 0; + s->v.subvol = 0; + s->v.tree = 0; + s->v.depth = 0; + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + s->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s->k, 0); + } err: bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); @@ -1478,6 +1519,9 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; + if (BCH_SNAPSHOT_DELETED(s.v)) + return 0; + mutex_lock(&d->progress_lock); for (unsigned i = 0; i < 2; i++) { u32 child = le32_to_cpu(s.v->children[i]); @@ -1536,6 +1580,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *s; int ret; + if (!bch2_snapshot_exists(c, k.k->p.offset)) + return 0; + if (k.k->type != KEY_TYPE_snapshot) return 0; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 24a451bb7024..69c484b77729 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -120,19 +120,24 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } -static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->live : 0; + return s ? 
s->state : SNAPSHOT_ID_empty; +} + +static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id); + rcu_read_unlock(); + + return ret; } static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) { - rcu_read_lock(); - bool ret = __bch2_snapshot_exists(c, id); - rcu_read_unlock(); - - return ret; + return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; } static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) @@ -241,7 +246,17 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); -int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) + ? 
0 + : __bch2_check_key_has_snapshot(trans, iter, k); +} int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h index 685a9fe209ab..9bccae1f3590 100644 --- a/fs/bcachefs/snapshot_format.h +++ b/fs/bcachefs/snapshot_format.h @@ -16,9 +16,9 @@ struct bch_snapshot { }; LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) - /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) /* * Snapshot trees: diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h index 6a969996e68f..1aa7a58442ae 100644 --- a/fs/bcachefs/snapshot_types.h +++ b/fs/bcachefs/snapshot_types.h @@ -3,8 +3,38 @@ #define _BCACHEFS_SNAPSHOT_TYPES_H #include "bbpos_types.h" +#include "darray.h" #include "subvolume_types.h" +typedef DARRAY(u32) snapshot_id_list; + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + enum snapshot_id_state { + SNAPSHOT_ID_empty, + SNAPSHOT_ID_live, + SNAPSHOT_ID_deleted, + } state; + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct rcu_head rcu; + size_t nr; +#ifndef RUST_BINDGEN + DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif +}; + struct snapshot_interior_delete { u32 id; u32 live_child; diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 1549d6daf7af..9d634b906dcd 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,33 +2,6 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include "darray.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - bool live; 
- u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - typedef struct { /* we can't have padding in this struct: */ u64 subvol; From 88f62ed60ceebf387140c8e59df8db827668d09a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 May 2025 13:23:22 -0400 Subject: [PATCH 119/218] bcachefs: delete_dead_snapshot_keys_v2() Since extents, dirents and xattrs require an inode with the corresponding snapshot ID to exist, we can avoid a lot of scanning by only scanning those trees for keys to process if the corresponding inode exists. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 164 ++++++++++++++++++++++++++++++++++------- 1 file changed, 136 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index a16fa0d8a274..9ec3275c7b0a 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1432,6 +1432,12 @@ static unsigned live_child(struct bch_fs *c, u32 id) return ret; } +static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) +{ + return snapshot_list_has_id(&d->delete_leaves, id) || + interior_delete_has_id(&d->delete_interior, id) != 0; +} + static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -1500,6 +1506,129 @@ static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree return ret; } +static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + + d->pos.pos
= POS_MIN; + + if (!btree_type_has_snapshots(d->pos.btree)) + continue; + + int ret = for_each_btree_key_commit(trans, iter, + d->pos.btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + + if (ret) + return ret; + } + + return 0; +} + +static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, + struct bpos start, struct bpos end) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + + d->pos.btree = btree; + d->pos.pos = POS_MIN; + + int ret = for_each_btree_key_max_commit(trans, iter, + btree, start, end, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + return ret; +} + +static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + int ret = 0; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + + while (1) { + struct bkey_s_c k; + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek(trans, &iter))); + if (ret) + break; + + if (!k.k) + break; + + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + if (snapshot_id_dying(d, k.k->p.snapshot)) { + struct bpos start = POS(k.k->p.offset, 0); + struct bpos end = POS(k.k->p.offset, U64_MAX); + + ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, 
start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); + if (ret) + break; + + bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); + } else { + bch2_btree_iter_advance(trans, &iter); + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + goto err; + + prev_inum = 0; + ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); +err: + bch2_disk_reservation_put(c, &res); + return ret; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it @@ -1683,34 +1812,13 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - - d->pos.pos = POS_MIN; - - if (!btree_type_has_snapshots(d->pos.btree)) - continue; - - ret = for_each_btree_key_commit(trans, iter, - d->pos.btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - } + ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) + ? 
delete_dead_snapshot_keys_v2(trans) + : delete_dead_snapshot_keys_v1(trans); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) + goto err; darray_for_each(d->delete_leaves, i) { ret = commit_do(trans, NULL, NULL, 0, From 7d4f2687ef8a625742c5df4e2d42f50ba398f3a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 19:14:54 -0400 Subject: [PATCH 120/218] bcachefs: bch2_journal_write() refactoring Make the locking easier to follow; also take io_refs earlier, in __journal_write_alloc(). Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 110 ++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 64 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index be86fd21de2a..c593d77dc8f2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1467,6 +1467,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); + rcu_read_lock(); darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1488,6 +1489,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); } } + rcu_read_unlock(); } static void __journal_write_alloc(struct journal *j, @@ -1500,7 +1502,8 @@ static void __journal_write_alloc(struct journal *j, struct bch_fs *c = container_of(j, struct bch_fs, journal); darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); + struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, + BCH_DEV_WRITE_REF_journal_write); if (!ca) continue; @@ -1514,8 +1517,10 @@ static void __journal_write_alloc(struct journal *j, ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) + sectors > ja->sectors_free) { + enumerated_ref_put(&ca->io_ref[WRITE], 
BCH_DEV_WRITE_REF_journal_write); continue; + } bch2_dev_stripe_increment(ca, &j->wp.stripe); @@ -1538,15 +1543,8 @@ static void __journal_write_alloc(struct journal *j, } } -/** - * journal_write_alloc - decide where to write next journal entry - * - * @j: journal object - * @w: journal buf (entry to be written) - * - * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w) +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned *replicas) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; @@ -1554,29 +1552,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned replicas = 0, replicas_want = - READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); bool advance_done = false; - rcu_read_lock(); - - /* We might run more than once if we have to stop and do discards: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); - bkey_for_each_ptr(ptrs, p) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); - if (ca) - replicas += ca->mi.durability; - } - retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - if (likely(replicas >= replicas_want)) + if (likely(*replicas >= replicas_want)) goto done; if (!advance_done) { @@ -1585,18 +1572,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) goto retry_alloc; } - if 
(replicas < replicas_want && target) { + if (*replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; advance_done = false; goto retry_target; } done: - rcu_read_unlock(); - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; + return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -1782,13 +1767,7 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, - BCH_DEV_WRITE_REF_journal_write); - if (!ca) { - /* XXX: fix this */ - bch_err(c, "missing device %u for journal write", ptr->dev); - continue; - } + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); @@ -2074,7 +2053,8 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); - if (ret) + + if (unlikely(ret)) goto err; mutex_lock(&j->buf_lock); @@ -2082,43 +2062,30 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_prep(j, w); mutex_unlock(&j->buf_lock); - if (ret) + + if (unlikely(ret)) goto err; - j->entry_bytes_written += vstruct_bytes(w->data); - + unsigned replicas_allocated = 0; while (1) { - spin_lock(&j->lock); - ret = journal_write_alloc(j, w); + ret = journal_write_alloc(j, w, &replicas_allocated); if (!ret || !j->can_discard) break; - spin_unlock(&j->lock); bch2_journal_do_discards(j); } - if (ret && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, 
c->block_bits), - bch2_err_str(ret)); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - if (ret) - goto err; + if (unlikely(ret)) + goto err_allocate_write; + spin_lock(&j->lock); /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): */ w->sectors = 0; w->write_allocated = true; + j->entry_bytes_written += vstruct_bytes(w->data); /* * journal entry has been compacted and allocated, recalculate space @@ -2130,9 +2097,6 @@ CLOSURE_CALLBACK(bch2_journal_write) w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) - goto no_io; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. @@ -2143,15 +2107,33 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (c->opts.nochanges) + goto no_io; + if (!JSET_NO_FLUSH(w->data)) continue_at(cl, journal_write_preflush, j->wq); else continue_at(cl, journal_write_submit, j->wq); return; -no_io: - continue_at(cl, journal_write_done, j->wq); - return; +err_allocate_write: + if (!bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + + bch2_journal_debug_to_text(&buf, j); + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), + le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), + bch2_err_str(ret)); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } err: bch2_fatal_error(c); +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); + } + continue_at(cl, journal_write_done, j->wq); } From e02888faab24494f91016d578e3dc9dce81e3d71 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 14:45:54 -0400 Subject: [PATCH 121/218] bcachefs: bch2_dev_in_target() no longer takes rcu_read_lock() Minor optimization, the caller generally has it 
already. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 6 +----- fs/bcachefs/rebalance.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index c1a2a957c884..c20ecf5e5381 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -212,17 +212,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { - rcu_read_lock(); struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); const struct bch_devs_mask *m = g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - bool ret = m ? test_bit(dev, m->d) : false; - rcu_read_unlock(); - - return ret; + return m ? test_bit(dev, m->d) : false; } default: BUG(); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 26c87ab019e8..7bcebcac2e1a 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -80,11 +80,13 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, unsigned ptr_bit = 1; unsigned rewrite_ptrs = 0; + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } + rcu_read_unlock(); return rewrite_ptrs; } @@ -132,10 +134,14 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target) + if (opts->background_target) { + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + if (!p.ptr.cached && + !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; + rcu_read_unlock(); + } return sectors; } From 84bd6afee121b9e9bcc26f88cb55e0ee5c7a8f56 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 16:24:43 -0400 Subject: [PATCH 
122/218] bcachefs: inline bch2_ob_ptr() This was an oversight, we want bch2_alloc_sectors_append_ptrs_inlined() fully inlined. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 14 -------------- fs/bcachefs/alloc_foreground.h | 15 ++++++++++++++- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 2d7f32f9499e..b50846da7ae4 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1388,20 +1388,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, return ret; } -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 192203410d4e..2e01c7b61ed1 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -3,6 +3,7 @@ #define _BCACHEFS_ALLOC_FOREGROUND_H #include "bcachefs.h" +#include "buckets.h" #include "alloc_types.h" #include "extents.h" #include "io_write_types.h" @@ -233,7 +234,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct closure *, struct write_point **); -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = ob_dev(c, ob); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} /* * Append pointers to the space we just 
allocated to @k, and mark @sectors space From fbe728f9569b683564421a9190be53e60111a864 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 21:15:34 -0400 Subject: [PATCH 123/218] bcachefs: improve check_inode_hash_info_matches_root() error message Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.c | 31 +++++++++++++++++++------------ fs/bcachefs/str_hash.h | 8 +++++--- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index a90bf7b8a2b4..55a3a116b5a8 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -157,6 +157,8 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans if (bkey_is_inode(k.k)) goto found; } + + /* This would've been caught by check_key_has_inode() */ bch_err(c, "%s(): inum %llu not found", __func__, inum); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; @@ -166,20 +168,25 @@ found:; if (ret) goto err; - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - if (hash_info->type != hash2.type || - memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { + struct bch_hash_info hash_root = bch2_hash_info_init(c, &inode); + if (hash_info->type != hash_root.type || + memcmp(&hash_info->siphash_key, + &hash_root.siphash_key, + sizeof(hash_root.siphash_key))) { ret = repair_inode_hash_info(trans, &inode); if (!ret) { - bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" - "%u %llx %llx\n" - "%u %llx %llx", - hash_info->type, - hash_info->siphash_key.k0, - hash_info->siphash_key.k1, - hash2.type, - hash2.siphash_key.k0, - hash2.siphash_key.k1); + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", inum); + + prt_printf(&buf, "root snapshot %u ", hash_root.inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_root.type); + prt_printf(&buf, " %llx %llx\n", hash_root.siphash_key.k0, hash_root.siphash_key.k1); 
+ + prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_info->type); + prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_fsck_repair_unimplemented; } } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0c1a00539bd1..ae3154fb6a94 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -32,6 +32,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) } struct bch_hash_info { + u32 inum_snapshot; u8 type; struct unicode_map *cf_encoding; /* @@ -45,11 +46,12 @@ static inline struct bch_hash_info bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { struct bch_hash_info info = { - .type = INODE_STR_HASH(bi), + .inum_snapshot = bi->bi_snapshot, + .type = INODE_STR_HASH(bi), #ifdef CONFIG_UNICODE - .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, + .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, #endif - .siphash_key = { .k0 = bi->bi_hash_seed } + .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { From 39430cfd27ed2e1243374ea28479773506e119c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 May 2025 15:02:53 -0400 Subject: [PATCH 124/218] bcachefs: Improve bch2_extent_ptr_set_cached() Preferentially keep existing cached pointers instead of adding new ones. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 55 ++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c4fe4ffd41f1..d3af841e48ef 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1136,33 +1136,50 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, struct bkey_s k, struct bch_extent_ptr *ptr) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + struct bkey_ptrs ptrs; union bch_extent_entry *entry; struct extent_ptr_decoded p; + bool have_cached_ptr; + unsigned drop_dev = ptr->dev; rcu_read_lock(); - if (!want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr_noerror(k, ptr); - goto out; +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Check if it's erasure coded - stripes can't contain cached + * data. Possibly something we can fix in the future? + */ + if (&entry->ptr == ptr && p.has_ec) + goto drop; + + if (p.ptr.cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { + bch2_bkey_drop_ptr_noerror(k, &entry->ptr); + ptr = NULL; + goto restart_drop_ptrs; + } + + have_cached_ptr = true; + } } - /* - * Stripes can't contain cached data, for - reasons. - * - * Possibly something we can fix in the future? 
- */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (&entry->ptr == ptr) { - if (p.has_ec) - bch2_bkey_drop_ptr_noerror(k, ptr); - else - ptr->cached = true; - goto out; - } + if (!ptr) + bkey_for_each_ptr(ptrs, ptr2) + if (ptr2->dev == drop_dev) + ptr = ptr2; - BUG(); -out: + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) + goto drop; + + ptr->cached = true; rcu_read_unlock(); + return; +drop: + rcu_read_unlock(); + bch2_bkey_drop_ptr_noerror(k, ptr); } /* From 502222041c810b5d5ba5d45512e0a131c7f07d0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 21:19:17 -0400 Subject: [PATCH 125/218] bcachefs: __bch2_fs_free() cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 45e2b2bc8c65..7c6ea43b4347 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -579,35 +579,36 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_free_fsck_errs(c); - bch2_fs_accounting_exit(c); - bch2_fs_async_obj_exit(c); - bch2_fs_sb_errors_exit(c); - bch2_fs_counters_exit(c); + bch2_fs_vfs_exit(c); bch2_fs_snapshots_exit(c); + bch2_fs_sb_errors_exit(c); + bch2_fs_replicas_exit(c); bch2_fs_quota_exit(c); + bch2_fs_nocow_locking_exit(c); + bch2_fs_journal_exit(&c->journal); bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); - bch2_fs_vfs_exit(c); - bch2_fs_ec_exit(c); - bch2_fs_encryption_exit(c); - bch2_fs_nocow_locking_exit(c); bch2_fs_io_write_exit(c); bch2_fs_io_read_exit(c); - bch2_fs_buckets_waiting_for_journal_exit(c); - bch2_fs_btree_interior_update_exit(c); - bch2_fs_btree_key_cache_exit(&c->btree_key_cache); - bch2_fs_btree_cache_exit(c); - bch2_fs_btree_iter_exit(c); - bch2_fs_replicas_exit(c); - bch2_fs_journal_exit(&c->journal); + bch2_fs_encryption_exit(c); + 
bch2_fs_ec_exit(c); + bch2_fs_counters_exit(c); + bch2_fs_compress_exit(c); bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_compress_exit(c); + bch2_fs_buckets_waiting_for_journal_exit(c); + bch2_fs_btree_write_buffer_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_cache_exit(c); + bch2_fs_accounting_exit(c); + bch2_fs_async_obj_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); + BUG_ON(atomic_read(&c->journal_keys.ref)); - bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) { u64 v = percpu_u64_get(c->online_reserved); From 96fc7d8adb7881f7ffffcd6ab2be3b43fc5a5978 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 May 2025 20:35:36 -0400 Subject: [PATCH 126/218] bcachefs: opts.rebalance_on_ac_only Add an option for setting rebalance to only run when connected to mains power. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/opts.h | 5 ++++ fs/bcachefs/rebalance.c | 48 ++++++++++++++++++++++++++++++++--- fs/bcachefs/rebalance.h | 4 ++- fs/bcachefs/rebalance_types.h | 5 ++++ fs/bcachefs/super.c | 3 ++- 6 files changed, 60 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0beff6af7ecf..061fa2666f35 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -870,6 +870,7 @@ LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); +LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index b8cd0b04e62a..f4c014ad43c1 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -490,6 +490,11 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ + x(rebalance_on_ac_only, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_REBALANCE_AC_ONLY, false, \ + NULL, "Enable rebalance while on mains power only\n") \ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 7bcebcac2e1a..8fefe2b174c2 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -518,6 +518,13 @@ static void rebalance_wait(struct bch_fs *c) bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); } +static bool bch2_rebalance_enabled(struct bch_fs *c) +{ + return c->opts.rebalance_enabled && + !(c->opts.rebalance_on_ac_only && + c->rebalance.on_battery); +} + static int do_rebalance(struct 
moving_context *ctxt) { struct btree_trans *trans = ctxt->trans; @@ -537,9 +544,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!c->opts.rebalance_enabled) { + if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(c->opts.rebalance_enabled || + kthread_wait_freezable(bch2_rebalance_enabled(c) || kthread_should_stop()); } @@ -714,9 +721,42 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; } -void bch2_fs_rebalance_init(struct bch_fs *c) +#ifdef CONFIG_POWER_SUPPLY +#include <linux/power_supply.h> + +static int bch2_rebalance_power_notifier(struct notifier_block *nb, + unsigned long event, void *data) { - bch2_pd_controller_init(&c->rebalance.pd); + struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); + + c->rebalance.on_battery = !power_supply_is_system_supplied(); + bch2_rebalance_wakeup(c); + return NOTIFY_OK; +} +#endif + +void bch2_fs_rebalance_exit(struct bch_fs *c) +{ +#ifdef CONFIG_POWER_SUPPLY + power_supply_unreg_notifier(&c->rebalance.power_notifier); +#endif +} + +int bch2_fs_rebalance_init(struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; + + bch2_pd_controller_init(&r->pd); + +#ifdef CONFIG_POWER_SUPPLY + r->power_notifier.notifier_call = bch2_rebalance_power_notifier; + int ret = power_supply_reg_notifier(&r->power_notifier); + if (ret) + return ret; + + r->on_battery = !power_supply_is_system_supplied(); +#endif + return 0; } static int check_rebalance_work_one(struct btree_trans *trans, diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index b7c8c0652ad6..5d9214fe1a22 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -52,7 +52,9 @@ void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); -void bch2_fs_rebalance_init(struct bch_fs *); + +void bch2_fs_rebalance_exit(struct bch_fs
*); +int bch2_fs_rebalance_init(struct bch_fs *); int bch2_check_rebalance_work(struct bch_fs *); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index fe5098c17dfc..33d77286f1d5 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -30,6 +30,11 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; + + bool on_battery; +#ifdef CONFIG_POWER_SUPPLY + struct notifier_block power_notifier; +#endif }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7c6ea43b4347..bd0565b7a9ba 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -583,6 +583,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_snapshots_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_replicas_exit(c); + bch2_fs_rebalance_exit(c); bch2_fs_quota_exit(c); bch2_fs_nocow_locking_exit(c); bch2_fs_journal_exit(&c->journal); @@ -867,7 +868,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_fs_move_init(c); bch2_fs_nocow_locking_init_early(c); bch2_fs_quota_init(c); - bch2_fs_rebalance_init(c); bch2_fs_sb_errors_init_early(c); bch2_fs_snapshots_init_early(c); bch2_fs_subvolumes_init_early(c); @@ -989,6 +989,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_direct_init(c) ?: bch2_fs_io_read_init(c) ?: + bch2_fs_rebalance_init(c) ?: bch2_fs_sb_errors_init(c) ?: bch2_fs_vfs_init(c); if (ret) From 66e9a7f13916fb0630cc48e0c21c474607aa3967 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 May 2025 00:51:39 -0400 Subject: [PATCH 127/218] bcachefs: bch2_dev_remove_stripes() respects degraded flags Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 +- fs/bcachefs/ec.c | 77 ++++++++++++++++++++++++---------- fs/bcachefs/ec.h | 4 +- fs/bcachefs/super.c | 3 +- 4 files changed, 62 insertions(+), 25 deletions(-) diff 
--git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 002e3853f8cf..81e2ae4bb400 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2442,8 +2442,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ - ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: - bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index dcd4e2266d34..bf5f4f6283a4 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2106,23 +2106,17 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, /* device removal */ -static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + unsigned flags) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); - - if (!a->stripe) + if (k.k->type != KEY_TYPE_stripe) return 0; - if (a->stripe_sectors) { - bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); - return -BCH_ERR_invalidate_stripe_to_dev; - } - - struct btree_iter iter; struct bkey_i_stripe *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); + bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -2139,35 +2133,76 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); if (ret) -
goto err; + return ret; struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == k_a.k->p.inode) + + /* XXX: how much redundancy do we still have? check degraded flags */ + + unsigned nr_good = 0; + + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev_idx) ptr->dev = BCH_SB_MEMBER_INVALID; + struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev); + nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; + } + rcu_read_unlock(); + + if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return -BCH_ERR_remove_would_lose_data; + + unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; + + if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) + return -BCH_ERR_remove_would_lose_data; + sectors = -sectors; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); +} + +static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, + unsigned flags) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); + return -BCH_ERR_invalidate_stripe_to_dev; + } + + struct btree_iter iter; + struct bkey_s_c_stripe s = + bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = bkey_err(s); if (ret) - goto err; -err: + return ret; + + ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +int
bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { return bch2_trans_run(c, for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev(trans, k); + bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); }))); } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 83d37bcb548a..548048adf0d5 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -288,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } -int bch2_dev_remove_stripes(struct bch_fs *, unsigned); +int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, unsigned, unsigned); +int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bd0565b7a9ba..faa012107a97 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1744,7 +1744,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); - ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + ret = bch2_dev_data_drop(c, ca->dev_idx, flags) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags); bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; From b3f80d09236e4915d7deedfaf15d7bdaef6f14d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 21:55:26 -0400 Subject: [PATCH 128/218] bcachefs: BCH_SB_MEMBER_DELETED_UUID Add a sentinel value for devices that have been removed, but whose index we don't want to reuse until a fsck has completed.
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 4 ++++ fs/bcachefs/sb-members.c | 29 ++++++++++++++++++++++++++++- fs/bcachefs/sb-members.h | 4 +++- fs/bcachefs/sb-members_format.h | 4 ++++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 92ae31737a24..dd08ec080313 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1079,6 +1079,10 @@ int bch2_check_allocations(struct bch_fs *c) * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + + if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) + bch2_sb_members_clean_deleted(c); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index f6a0b3de6bca..b9568a68fbf6 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -525,6 +525,7 @@ int bch2_sb_member_alloc(struct bch_fs *c) unsigned u64s; int best = -1; u64 best_last_mount = 0; + unsigned nr_deleted = 0; if (dev_idx < BCH_SB_MEMBERS_MAX) goto have_slot; @@ -535,7 +536,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) continue; struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - if (bch2_member_alive(&m)) + + nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + + if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) continue; u64 last_mount = le64_to_cpu(m.last_mount); @@ -549,6 +553,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) goto have_slot; } + if (nr_deleted) + bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", + nr_deleted); + return -BCH_ERR_ENOSPC_sb_members; have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); @@ -564,3 +572,22 @@ int bch2_sb_member_alloc(struct bch_fs *c) c->disk_sb.sb->nr_devices = nr_devices; return dev_idx; } + +void bch2_sb_members_clean_deleted(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + bool write_sb = false; + + for (unsigned i = 0; i < 
c->sb.nr_devices; i++) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); + + if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { + memset(&m->uuid, 0, sizeof(m->uuid)); + write_sb = true; + } + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index c9cb8f7657b0..6bd9b86aee5b 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -320,7 +320,8 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; static inline bool bch2_member_alive(struct bch_member *m) { - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && + !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); } static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) @@ -381,5 +382,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); int bch2_sb_member_alloc(struct bch_fs *); +void bch2_sb_members_clean_deleted(struct bch_fs *); #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 472218a59102..fb72ad730518 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -13,6 +13,10 @@ */ #define BCH_SB_MEMBER_INVALID 255 +#define BCH_SB_MEMBER_DELETED_UUID \ + UUID_INIT(0xffffffff, 0xffff, 0xffff, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ From 09fa6c3039d8bb22351ad071ea4656dd4f331d18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 21:36:23 -0400 Subject: [PATCH 129/218] bcachefs: bch2_dev_data_drop_by_backpointers() Currently, device removal has to scan all metadata for pointers to the device being removed. 
Add a new method, with the same interface as bch2_dev_data_drop(), that scans by backpointers instead - this will drastically speed up device removal. Signed-off-by: Kent Overstreet --- fs/bcachefs/migrate.c | 117 +++++++++++++++++++++++++++++++++++++----- fs/bcachefs/migrate.h | 3 +- 2 files changed, 105 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 90dcf80bd64a..bb7a92270c09 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,10 +4,13 @@ */ #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" +#include "ec.h" #include "errcode.h" #include "extents.h" #include "io_write.h" @@ -20,7 +23,7 @@ #include "super-io.h" static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, int flags, bool metadata) + unsigned dev_idx, unsigned flags, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; unsigned lost = metadata ? 
BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; @@ -37,11 +40,28 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } +static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, unsigned dev_idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_buf k; + + bch2_bkey_buf_init(&k); + bch2_bkey_buf_copy(&k, c, &b->key); + + int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: + bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); + + bch_err_fn(c, ret); + bch2_bkey_buf_exit(&k, c); + return ret; +} + static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, - int flags) + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -77,9 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } +static int bch2_dev_btree_drop_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + unsigned dev_idx, + struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 
0 : ret; + + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int bch2_dev_usrdata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id id; @@ -106,7 +144,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, static int bch2_dev_metadata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { struct btree_trans *trans; struct btree_iter iter; @@ -137,20 +175,12 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; - bch2_bkey_buf_copy(&k, c, &b->key); - - ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), - dev_idx, flags, true); - if (ret) - break; - - ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } - bch_err_msg(c, ret, "updating btree node key"); if (ret) break; next: @@ -176,7 +206,66 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, return ret; } -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, + struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed); + int ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) + goto out; + + /* + * XXX: pass flags arg to invalidate_stripe_to_dev and handle it + * properly + */ + + if (bkey_is_btree_ptr(k.k)) + ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, 
last_flushed, flags); + else if (k.k->type == KEY_TYPE_stripe) + ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); + else + ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + POS(dev_idx, 0), + POS(dev_idx, U64_MAX), 0, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), + &last_flushed, flags); + + })); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) { struct progress_indicator_state progress; bch2_progress_init(&progress, c, diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h index 027efaa0d575..30018140711b 100644 --- a/fs/bcachefs/migrate.h +++ b/fs/bcachefs/migrate.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MIGRATE_H #define _BCACHEFS_MIGRATE_H -int bch2_dev_data_drop(struct bch_fs *, unsigned, int); +int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); +int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_MIGRATE_H */ From a8539ad8fa88e39c3f566b7c35029f25ab90b72e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 3 May 2025 21:26:04 -0400 Subject: [PATCH 130/218] bcachefs: bcachefs_metadata_version_fast_device_removal Fast device removal, that uses backpointers to find pointers to the device being removed instead of a full metadata scan. 
This requires BCH_SB_MEMBER_DELETED_UUID, which is an incompatible change - hence the version number bump. We don't fully trust backpointers, so we don't want to reuse device indexes until after a fsck has verified that there aren't any pointers to removed devices. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/ec.c | 4 +++- fs/bcachefs/super.c | 27 +++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 061fa2666f35..a483d440fa39 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -696,7 +696,8 @@ struct bch_sb_field_ext { x(stripe_lru, BCH_VERSION(1, 23)) \ x(casefolding, BCH_VERSION(1, 24)) \ x(extent_flags, BCH_VERSION(1, 25)) \ - x(snapshot_deletion_v2, BCH_VERSION(1, 26)) + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ + x(fast_device_removal, BCH_VERSION(1, 27)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index bf5f4f6283a4..c581426e3894 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2197,13 +2197,15 @@ static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, s int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { - return bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); }))); + bch_err_fn(c, ret); + return ret; } /* startup/shutdown */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index faa012107a97..dc8189f9d2f1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1726,6 +1726,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; + bool fast_device_removal = 
!bch2_request_incompat_feature(c, + bcachefs_metadata_version_fast_device_removal); int ret; down_write(&c->state_lock); @@ -1744,12 +1746,25 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); - ret = bch2_dev_data_drop(c, ca->dev_idx, flags) ?: - bch2_dev_remove_stripes(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "bch2_dev_data_drop()"); + ret = fast_device_removal + ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) + : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags)); if (ret) goto err; + /* Check if device still has data before blowing away alloc info */ + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (!data_type_is_empty(i) && + !data_type_is_hidden(i) && + usage.buckets[i]) { + bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", + __bch2_data_types[i], usage.buckets[i]); + ret = -EBUSY; + goto err; + } + ret = bch2_dev_remove_alloc(c, ca); bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) @@ -1813,7 +1828,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) */ mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - memset(&m->uuid, 0, sizeof(m->uuid)); + + if (fast_device_removal) + m->uuid = BCH_SB_MEMBER_DELETED_UUID; + else + memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); From 8c69e2b52ea81b102ace48debd114467199ca77a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 14:26:18 -0400 Subject: [PATCH 131/218] bcachefs: Knob for manual snapshot deletion Add 'opts.auto_snapshot_deletion', enabled by default. This may be turned off so that the new sysfs knob, 'internal/trigger_delete_dead_snapshots', may be used instead - this will allow snapshot deletion to be profiled more easily.
Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 6 ++++++ fs/bcachefs/snapshot.c | 28 +++++++++++++++++++++++----- fs/bcachefs/snapshot.h | 1 + fs/bcachefs/snapshot_types.h | 1 + fs/bcachefs/sysfs.c | 5 +++++ 5 files changed, 36 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index f4c014ad43c1..2a02606254b3 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -495,6 +495,12 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_REBALANCE_AC_ONLY, false, \ NULL, "Enable rebalance while on mains power only\n") \ + x(auto_snapshot_deletion, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 9ec3275c7b0a..c3dc450cbcec 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1776,15 +1776,19 @@ static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snap prt_newline(out); } -int bch2_delete_dead_snapshots(struct bch_fs *c) +int __bch2_delete_dead_snapshots(struct bch_fs *c) { - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) - return 0; - - struct btree_trans *trans = bch2_trans_get(c); struct snapshot_delete *d = &c->snapshot_delete; int ret = 0; + if (!mutex_trylock(&d->lock)) + return 0; + + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + goto out_unlock; + + struct btree_trans *trans = bch2_trans_get(c); + /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: @@ -1857,11 +1861,21 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) d->running = false; mutex_unlock(&d->progress_lock); bch2_trans_put(trans); +out_unlock: + mutex_unlock(&d->lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); return ret; } +int 
bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (!c->opts.auto_snapshot_deletion) + return 0; + + return __bch2_delete_dead_snapshots(c); +} + void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); @@ -1874,6 +1888,9 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) void bch2_delete_dead_snapshots_async(struct bch_fs *c) { + if (!c->opts.auto_snapshot_deletion) + return; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) return; @@ -1977,6 +1994,7 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) void bch2_fs_snapshots_init_early(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); + mutex_init(&c->snapshot_delete.lock); mutex_init(&c->snapshot_delete.progress_lock); mutex_init(&c->snapshots_unlinked_lock); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 69c484b77729..63b9469eb1eb 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -273,6 +273,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int __bch2_delete_dead_snapshots(struct bch_fs *); int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_work(struct work_struct *); void bch2_delete_dead_snapshots_async(struct bch_fs *); diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h index 1aa7a58442ae..0ab698f13e5c 100644 --- a/fs/bcachefs/snapshot_types.h +++ b/fs/bcachefs/snapshot_types.h @@ -42,6 +42,7 @@ struct snapshot_interior_delete { typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; struct snapshot_delete { + struct mutex lock; struct work_struct work; struct mutex progress_lock; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index adf99a805a62..4c7d609d79fd 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -149,6 +149,7 @@ 
write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_btree_updates); write_attribute(trigger_freelist_wakeup); write_attribute(trigger_recalc_capacity); +write_attribute(trigger_delete_dead_snapshots); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -439,6 +440,9 @@ STORE(bch2_fs) up_read(&c->state_lock); } + if (attr == &sysfs_trigger_delete_dead_snapshots) + __bch2_delete_dead_snapshots(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -568,6 +572,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_btree_updates, &sysfs_trigger_freelist_wakeup, &sysfs_trigger_recalc_capacity, + &sysfs_trigger_delete_dead_snapshots, &sysfs_gc_gens_pos, From 970dde8271b607876bfdbb66b941ffda35e74973 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 May 2025 16:34:35 -0400 Subject: [PATCH 132/218] bcachefs: Add missing include fix debug build in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 14cb2c7dfda4..25cf61ebd40c 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include From 1dfa01ef24151547a2a622e90ad73b082b1bc739 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 May 2025 14:24:12 -0400 Subject: [PATCH 133/218] bcachefs: bch2_copygc_dev_wait_amount() Factor out the per-device calculations, for better introspection. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/movinggc.c | 56 +++++++++++++++++++++------------- fs/bcachefs/movinggc.h | 2 +- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b50846da7ae4..828cf94217dd 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -465,7 +465,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, prt_printf(&buf, "blocking\t%u\n", cl != NULL); prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); - prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + prt_printf(&buf, "copygc_wait\t%llu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e97e87ebe312..6e5680a3a97c 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -261,6 +261,25 @@ static int bch2_copygc(struct moving_context *ctxt, return ret; } +static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) +{ + struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); + struct bch_dev_usage usage; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage.buckets[i] = usage_full.d[i].buckets; + + s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * + ca->mi.bucket_size) >> 1); + s64 fragmented = 0; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage_full.d[i].fragmented; + + return max(0LL, fragmented_allowed - fragmented); +} + /* * Copygc runs when the amount of fragmented data is above some arbitrary * threshold: @@ -275,28 +294,13 @@ static int bch2_copygc(struct moving_context *ctxt, * often and continually reduce the amount of fragmented space as the device * 
fills up. So, we increase the threshold by half the current free space. */ -unsigned long bch2_copygc_wait_amount(struct bch_fs *c) +u64 bch2_copygc_wait_amount(struct bch_fs *c) { - s64 wait = S64_MAX, fragmented_allowed, fragmented; + u64 wait = U64_MAX; rcu_read_lock(); - for_each_rw_member_rcu(c, ca) { - struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); - struct bch_dev_usage usage; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage.buckets[i] = usage_full.d[i].buckets; - - fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - fragmented = 0; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; - - wait = min(wait, max(0LL, fragmented_allowed - fragmented)); - } + for_each_rw_member_rcu(c, ca) + wait = min(wait, bch2_copygc_dev_wait_amount(ca)); rcu_read_unlock(); return wait; @@ -320,14 +324,22 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait:\t"); - prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); - prt_newline(out); + bch2_printbuf_make_room(out, 4096); rcu_read_lock(); + out->atomic++; + + prt_printf(out, "Currently calculated wait:\n"); + for_each_rw_member_rcu(c, ca) { + prt_printf(out, " %s:\t", ca->name); + prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); + prt_newline(out); + } + struct task_struct *t = rcu_dereference(c->copygc_thread); if (t) get_task_struct(t); + --out->atomic; rcu_read_unlock(); if (t) { diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index d1885cf67a45..b9683d22bab0 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -unsigned long bch2_copygc_wait_amount(struct bch_fs *); +u64 bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_wait_to_text(struct 
printbuf *, struct bch_fs *); static inline void bch2_copygc_wakeup(struct bch_fs *c) From 82067c916994dd1bfec65496144dc16e17899e36 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 May 2025 23:21:28 -0400 Subject: [PATCH 134/218] bcachefs: buckets_in_flight on stack copygc runs with a full stack available, there's no reason to dynamically allocate this. Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6e5680a3a97c..66f4920552c5 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -354,19 +354,13 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight *buckets; + struct buckets_in_flight buckets = {}; u64 last, wait; - int ret = 0; - buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); - if (!buckets) - return -ENOMEM; - ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); bch_err_msg(c, ret, "allocating copygc buckets in flight"); - if (ret) { - kfree(buckets); + if (ret) return ret; - } set_freezable(); @@ -389,13 +383,13 @@ static int bch2_copygc_thread(void *arg) cond_resched(); if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); __refrigerator(false); continue; } @@ -406,7 +400,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); 
trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -416,7 +410,7 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&ctxt, buckets, &did_work); + ret = bch2_copygc(&ctxt, &buckets, &did_work); c->copygc_running = false; wake_up(&c->copygc_running_wq); @@ -427,16 +421,14 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } } - move_buckets_wait(&ctxt, buckets, true); - - rhashtable_destroy(&buckets->table); - kfree(buckets); + move_buckets_wait(&ctxt, &buckets, true); + rhashtable_destroy(&buckets.table); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); From 3ffda8c219d636012cfc2b5dae0bf19d831a53e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 15:16:14 -0400 Subject: [PATCH 135/218] bcachefs: kill dead code in move_data_phys() Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ff56d8886c32..42076aa3438b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -849,10 +849,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - bch_err_msg(c, ret, "looking up alloc key"); - if (ret) - goto err; - ret = bch2_btree_write_buffer_tryflush(trans); if (!bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "flushing btree write buffer"); From 7f9dada701aa357cecf432cf2f345fd3897f92ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 15:27:36 -0400 Subject: [PATCH 136/218] bcachefs: delete dead items in bch_dev Signed-off-by: Kent Overstreet --- 
fs/bcachefs/bcachefs.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index cd35d1cf3fbb..07a16c473af3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -631,10 +631,6 @@ struct bch_dev { unsigned nr_partial_buckets; unsigned nr_btree_reserve; - size_t inc_gen_needs_gc; - size_t inc_gen_really_needs_gc; - size_t buckets_waiting_on_journal; - struct work_struct invalidate_work; struct work_struct discard_work; struct mutex discard_buckets_in_flight_lock; From 13ffcbae86dadbf7711f42e4940bafae88a87e1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 16:25:21 -0400 Subject: [PATCH 137/218] bcachefs: "buckets with backpointer mismatches" now allocated on demand More self healing work: we're going to be calling check_bucket_backpointer_mismatch() at runtime, outside of fsck. Then when we need to we'll kick off the full check_extents_to_backpointers recovery pass. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 123 ++++++++++++++++++++++--------------- fs/bcachefs/backpointers.h | 4 ++ fs/bcachefs/bcachefs.h | 9 ++- fs/bcachefs/buckets.c | 22 +++++++ fs/bcachefs/super.c | 7 +++ 5 files changed, 113 insertions(+), 52 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e6178eb2c396..631d4d24d78f 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -15,6 +15,14 @@ #include +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -671,8 +679,22 @@ static int check_extent_to_backpointers(struct btree_trans *trans, rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); - bool empty = ca && 
test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + if (!ca) { + rcu_read_unlock(); + continue; + } + + u64 b = PTR_BUCKET_NR(ca, &p.ptr); + bool set[2]; + + for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) { + unsigned long *bitmap = + READ_ONCE(ca->bucket_backpointer_mismatches[i].buckets); + set[i] = bitmap && test_bit(b, bitmap); + } + + bool check = set[0]; + bool empty = set[1]; bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); rcu_read_unlock(); @@ -724,14 +746,6 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, return ret; } -static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -{ - return (struct bbpos) { - .btree = bp.btree_id, - .pos = bp.pos, - }; -} - static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; @@ -933,12 +947,25 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b goto err; } - if (!sectors[ALLOC_dirty] && - !sectors[ALLOC_stripe] && - !sectors[ALLOC_cached]) - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); - else - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + bool empty = (sectors[ALLOC_dirty] + + sectors[ALLOC_stripe] + + sectors[ALLOC_cached]) == 0; + + struct bucket_bitmap *bitmap = &ca->bucket_backpointer_mismatches[empty]; + + mutex_lock(&bitmap->lock); + if (!bitmap->buckets) { + bitmap->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), GFP_KERNEL); + if (!bitmap->buckets) { + mutex_unlock(&bitmap->lock); + ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + goto err; + } + } + + bitmap->nr += !__test_and_set_bit(alloc_k.k->p.offset, bitmap->buckets); + mutex_unlock(&bitmap->lock); } err: bch2_dev_put(ca); @@ -962,8 +989,19 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) goto next; struct bpos bucket = bp_pos_to_bucket(ca, pos); - bucket.offset = 
find_next_bit(ca->bucket_backpointer_mismatches, - ca->mi.nbuckets, bucket.offset); + u64 next = ca->mi.nbuckets; + + for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) { + unsigned long *bitmap = + READ_ONCE(ca->bucket_backpointer_mismatches[i].buckets); + if (bitmap) + next = min_t(u64, next, + find_next_bit(bitmap, + ca->mi.nbuckets, + bucket.offset)); + } + + bucket.offset = next; if (bucket.offset == ca->mi.nbuckets) goto next; @@ -1072,28 +1110,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) { int ret = 0; - /* - * Can't allow devices to come/go/resize while we have bucket bitmaps - * allocated - */ - down_read(&c->state_lock); - - for_each_member_device(c, ca) { - BUG_ON(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - if (!ca->bucket_backpointer_mismatches || - !ca->bucket_backpointer_empty) { - bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; - goto err_free_bitmaps; - } - } - struct btree_trans *trans = bch2_trans_get(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; @@ -1110,8 +1126,8 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; for_each_member_device(c, ca) { nr_buckets += ca->mi.nbuckets; - nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); - nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + nr_mismatches += ca->bucket_backpointer_mismatches[0].nr; + nr_empty += ca->bucket_backpointer_mismatches[1].nr; } if (!nr_mismatches && !nr_empty) @@ -1153,19 +1169,17 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); bch2_btree_cache_unpin(c); -err_free_bitmaps: - 
for_each_member_device(c, ca) { - kvfree(ca->bucket_backpointer_empty); - ca->bucket_backpointer_empty = NULL; - kvfree(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = NULL; - } - up_read(&c->state_lock); + for_each_member_device(c, ca) + for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatches[i]); + bch_err_fn(c, ret); return ret; } +/* backpointers -> extents */ + static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, @@ -1281,3 +1295,12 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) bch_err_fn(c, ret); return ret; } + +void bch2_bucket_bitmap_free(struct bucket_bitmap *b) +{ + mutex_lock(&b->lock); + kvfree(b->buckets); + b->buckets = NULL; + b->nr = 0; + mutex_unlock(&b->lock); +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 16575dbc5736..c72707ee9d42 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -182,8 +182,12 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, struct btree_iter *, struct bkey_buf *); +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bpos, struct bkey_buf *); + int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); int bch2_check_backpointers_to_extents(struct bch_fs *); +void bch2_bucket_bitmap_free(struct bucket_bitmap *); + #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 07a16c473af3..66659dade3f0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -574,6 +574,12 @@ enum bch_dev_write_ref { BCH_DEV_WRITE_REF_NR, }; +struct bucket_bitmap { + unsigned long *buckets; + u64 nr; + struct mutex lock; +}; + struct bch_dev { struct kobject kobj; 
#ifdef CONFIG_BCACHEFS_DEBUG @@ -618,8 +624,7 @@ struct bch_dev { u8 *oldest_gen; unsigned long *buckets_nouse; - unsigned long *bucket_backpointer_mismatches; - unsigned long *bucket_backpointer_empty; + struct bucket_bitmap bucket_backpointer_mismatches[2]; struct bch_dev_usage_full __percpu *usage; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 596edc7bba2f..8d6955ef631b 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1324,6 +1324,28 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) sizeof(bucket_gens->b[0]) * copy); } + for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) { + struct bucket_bitmap *bitmap = &ca->bucket_backpointer_mismatches[i]; + + mutex_lock(&bitmap->lock); + if (bitmap->buckets) { + unsigned long *n = kvcalloc(BITS_TO_LONGS(nbuckets), + sizeof(unsigned long), GFP_KERNEL); + if (!n) { + mutex_unlock(&bitmap->lock); + ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + goto err; + } + + memcpy(n, bitmap->buckets, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvfree(bitmap->buckets); + bitmap->buckets = n; + + } + mutex_unlock(&bitmap->lock); + } + rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index dc8189f9d2f1..77b834cfe126 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -11,6 +11,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "async_objs.h" +#include "backpointers.h" #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" @@ -1341,6 +1342,9 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatches[i]); + bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); @@ 
-1471,6 +1475,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) + mutex_init(&ca->bucket_backpointer_mismatches[i].lock); + bch2_dev_allocator_background_init(ca); if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || From 3b7b0c3996b570f9c305c6f3df475a719920d65c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 23:15:40 -0400 Subject: [PATCH 138/218] bcachefs: print label correctly in sb_member_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-members.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index b9568a68fbf6..75184d8e685a 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -222,17 +222,11 @@ static void member_to_text(struct printbuf *out, printbuf_indent_add(out, 2); prt_printf(out, "Label:\t"); - if (BCH_MEMBER_GROUP(&m)) { - unsigned idx = BCH_MEMBER_GROUP(&m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { + if (BCH_MEMBER_GROUP(&m)) + bch2_disk_path_to_text_sb(out, sb, + BCH_MEMBER_GROUP(&m) - 1); + else prt_printf(out, "(none)"); - } prt_newline(out); prt_printf(out, "UUID:\t"); From 20a4b7f3b802f37e44a3c10f97d6ae1aae4daa4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 18:12:59 -0400 Subject: [PATCH 139/218] bcachefs: recovery_passes_types.h -> recovery_passes_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/bcachefs_format.h | 1 + .../{recovery_passes_types.h => recovery_passes_format.h} | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) rename fs/bcachefs/{recovery_passes_types.h => recovery_passes_format.h} (95%) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 
66659dade3f0..b4209e270712 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -213,7 +213,6 @@ #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" #include "snapshot_types.h" diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a483d440fa39..df5a4d4df640 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -510,6 +510,7 @@ struct bch_sb_field { #include "logged_ops_format.h" #include "lru_format.h" #include "quota_format.h" +#include "recovery_passes_format.h" #include "reflink_format.h" #include "replicas_format.h" #include "snapshot_format.h" diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_format.h similarity index 95% rename from fs/bcachefs/recovery_passes_types.h rename to fs/bcachefs/recovery_passes_format.h index be3185fc6ef4..291f58dfbd24 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_format.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H -#define _BCACHEFS_RECOVERY_PASSES_TYPES_H +#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H +#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H #define PASS_SILENT BIT(0) #define PASS_FSCK BIT(1) @@ -81,4 +81,4 @@ enum bch_recovery_pass_stable { #undef x }; -#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ +#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ From e21f99772112cea57d8389a03d184e69141194b1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 18:24:20 -0400 Subject: [PATCH 140/218] bcachefs: bch_sb_field_recovery_passes New superblock section for statistics on recovery passes - last time ran (successfully), last runtime. This will be used by self healing code to determine when to kick off potentially expensive recovery passes. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/recovery_passes.c | 177 ++++++++++++++++++--------- fs/bcachefs/recovery_passes.h | 2 + fs/bcachefs/recovery_passes_format.h | 20 +++ 4 files changed, 146 insertions(+), 56 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index df5a4d4df640..5900ff3715c6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -497,7 +497,8 @@ struct bch_sb_field { x(members_v2, 11) \ x(errors, 12) \ x(ext, 13) \ - x(downgrade, 14) + x(downgrade, 14) \ + x(recovery_passes, 15) #include "alloc_background_format.h" #include "dirent_format.h" diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index e14aca00cb7d..4da3f6463db8 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -28,6 +28,121 @@ const char * const bch2_recovery_passes[] = { NULL }; +static const u8 passes_to_stable_map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static const u8 passes_from_stable_map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_to_stable_map[i]); + return ret; +} + +static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) +{ + return pass < ARRAY_SIZE(passes_from_stable_map) + ? 
passes_from_stable_map[pass] + : 0; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_from_stable_map[i]); + return ret; +} + +static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) +{ + return 0; +} + +static void bch2_sb_recovery_passes_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_recovery_passes *r = + field_to_type(f, recovery_passes); + unsigned nr = recovery_passes_nr_entries(r); + + if (out->nr_tabstops < 1) + printbuf_tabstop_push(out, 32); + if (out->nr_tabstops < 2) + printbuf_tabstop_push(out, 16); + + prt_printf(out, "Pass\tLast run\tLast runtime\n"); + + for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { + if (!i->last_run) + continue; + + unsigned idx = i - r->start; + + prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); + + bch2_prt_datetime(out, le64_to_cpu(i->last_run)); + prt_tab(out); + + bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); + prt_newline(out); + } +} + +static void bch2_sb_recovery_pass_complete(struct bch_fs *c, + enum bch_recovery_pass pass, + s64 start_time) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + s64 end_time = ktime_get_real_seconds(); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __clear_bit_le64(stable, ext->recovery_passes_required); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable >= recovery_passes_nr_entries(r)) { + unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); + + r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); + if (!r) { + bch_err(c, "error creating recovery_passes sb section"); + 
goto out; + } + } + + r->start[stable].last_run = cpu_to_le64(end_time); + r->start[stable].last_runtime = cpu_to_le32(max(0, end_time - start_time)); +out: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + +const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { + .validate = bch2_sb_recovery_passes_validate, + .to_text = bch2_sb_recovery_passes_to_text +}; + /* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ static int bch2_recovery_pass_empty(struct bch_fs *c) { @@ -88,41 +203,6 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; -static const u8 passes_to_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) -{ - return passes_to_stable_map[pass]; -} - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_to_stable_map[i]); - return ret; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) 
[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - /* * For when we need to rewind recovery passes and run a pass we skipped: */ @@ -219,21 +299,6 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, return ret; } -static void bch2_clear_recovery_pass_required(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (test_bit_le64(s, ext->recovery_passes_required)) { - __clear_bit_le64(s, ext->recovery_passes_required); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); -} - u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; @@ -266,14 +331,19 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; if (!(p->when & PASS_SILENT)) bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), bch2_recovery_passes[pass]); - ret = p->fn(c); + + s64 start_time = ktime_get_real_seconds(); + int ret = p->fn(c); if (ret) return ret; + + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_sb_recovery_pass_complete(c, pass, start_time); + if (!(p->when & PASS_SILENT)) bch2_print(c, KERN_CONT " done\n"); @@ -326,9 +396,6 @@ int bch2_run_recovery_passes(struct bch_fs *c) spin_unlock_irq(&c->recovery_pass_lock); ret = bch2_run_recovery_pass(c, pass) ?: bch2_journal_flush(&c->journal); - - if (!ret && !test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); spin_lock_irq(&c->recovery_pass_lock); if (c->next_recovery_pass < c->curr_recovery_pass) { diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 
f33dd005beb4..d39856f908da 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -3,6 +3,8 @@ extern const char * const bch2_recovery_passes[]; +extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; + u64 bch2_recovery_passes_to_stable(u64 v); u64 bch2_recovery_passes_from_stable(u64 v); diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h index 291f58dfbd24..c434eafbca19 100644 --- a/fs/bcachefs/recovery_passes_format.h +++ b/fs/bcachefs/recovery_passes_format.h @@ -81,4 +81,24 @@ enum bch_recovery_pass_stable { #undef x }; +struct recovery_pass_entry { + __le64 last_run; + __le32 last_runtime; + __le32 flags; +}; + +struct bch_sb_field_recovery_passes { + struct bch_sb_field field; + struct recovery_pass_entry start[]; +}; + +static inline unsigned +recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) +{ + return r + ? ((vstruct_end(&r->field) - (void *) &r->start[0]) / + sizeof(struct recovery_pass_entry)) + : 0; +} + #endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ From 001c1d146f38620d6d969b66421460a5d8fd966d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 23:22:23 -0400 Subject: [PATCH 141/218] bcachefs: online_fsck_mutex -> run_recovery_passes_lock Prep work for automatically running recovery passes asynchronously. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/fsck.c | 6 +++--- fs/bcachefs/recovery_passes.c | 8 ++++++++ fs/bcachefs/recovery_passes.h | 2 ++ fs/bcachefs/super.c | 4 ++-- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b4209e270712..5392a0ec6439 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1128,7 +1128,7 @@ struct bch_fs { /* never rewinds version of curr_recovery_pass */ enum bch_recovery_pass recovery_pass_done; spinlock_t recovery_pass_lock; - struct semaphore online_fsck_mutex; + struct semaphore run_recovery_passes_lock; /* DEBUG JUNK */ struct dentry *fs_debug_dir; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 942a03d06074..7c10325a1fac 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3204,7 +3204,7 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - up(&c->online_fsck_mutex); + up(&c->run_recovery_passes_lock); bch2_ro_ref_put(c); return ret; } @@ -3228,7 +3228,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) if (!bch2_ro_ref_tryget(c)) return -EROFS; - if (down_trylock(&c->online_fsck_mutex)) { + if (down_trylock(&c->run_recovery_passes_lock)) { bch2_ro_ref_put(c); return -EAGAIN; } @@ -3260,7 +3260,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) bch_err_fn(c, ret); if (thr) bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); + up(&c->run_recovery_passes_lock); bch2_ro_ref_put(c); } return ret; diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 4da3f6463db8..5f2bd8c10ce6 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -380,6 +380,7 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + 
down(&c->run_recovery_passes_lock); spin_lock_irq(&c->recovery_pass_lock); while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { @@ -423,6 +424,13 @@ int bch2_run_recovery_passes(struct bch_fs *c) } spin_unlock_irq(&c->recovery_pass_lock); + up(&c->run_recovery_passes_lock); return ret; } + +void bch2_fs_recovery_passes_init(struct bch_fs *c) +{ + spin_lock_init(&c->recovery_pass_lock); + sema_init(&c->run_recovery_passes_lock, 1); +} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index d39856f908da..4c03472be5b9 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -20,4 +20,6 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf int bch2_run_online_recovery_passes(struct bch_fs *); int bch2_run_recovery_passes(struct bch_fs *); +void bch2_fs_recovery_passes_init(struct bch_fs *); + #endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 77b834cfe126..95ed5ab541d3 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -51,6 +51,7 @@ #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-counters.h" @@ -848,8 +849,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); - spin_lock_init(&c->recovery_pass_lock); - sema_init(&c->online_fsck_mutex, 1); for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -869,6 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_fs_move_init(c); bch2_fs_nocow_locking_init_early(c); bch2_fs_quota_init(c); + bch2_fs_recovery_passes_init(c); bch2_fs_sb_errors_init_early(c); bch2_fs_snapshots_init_early(c); bch2_fs_subvolumes_init_early(c); From b51b4055c3cd516cba9e0aee3d8ecfba1d75c047 Mon Sep 17 00:00:00 2001 From: Kent Overstreet 
Date: Sat, 10 May 2025 13:24:25 -0400 Subject: [PATCH 142/218] bcachefs: Slim down inlined part of bch2_btree_path_upgrade() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 22 +++++++++++++--------- fs/bcachefs/btree_locking.h | 19 +++++-------------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index baa505a9a706..448613be90ba 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -618,22 +618,22 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - EBUG_ON(path->locks_want >= new_locks_want); - - path->locks_want = new_locks_want; + path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); bool ret = btree_path_get_locks(trans, path, true, f); bch2_trans_verify_locks(trans); return ret; } -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +int __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); - if (ret) + struct get_locks_fail f = {}; + unsigned old_locks_want = path->locks_want; + int ret = 0; + + if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, &f)) goto out; /* @@ -668,6 +668,10 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, btree_path_get_locks(trans, linked, true, NULL); } } + + trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, + old_locks_want, new_locks_want, &f); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); out: bch2_trans_verify_locks(trans); return ret; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 66b27c0853a5..59000d0dabea 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ 
-380,27 +380,18 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, struct btree_path *, unsigned, struct get_locks_fail *); -bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +int __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f = {}; - unsigned old_locks_want = path->locks_want; - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - if (path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->nodes_locked) - return 0; - - trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, - old_locks_want, new_locks_want, &f); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + return likely(path->locks_want >= new_locks_want && path->nodes_locked) + ? 0 + : __bch2_btree_path_upgrade(trans, path, new_locks_want); } /* misc: */ From 284251557562f6e8f0ce439cb52a6b9645b4e9a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 14:14:06 -0400 Subject: [PATCH 143/218] bcachefs: Debug params are now static_keys We'd like users to be able to debug without building custom kernels, so this will help us get rid of CONFIG_BCACHEFS_DEBUG, at least for most things. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 4 +-- fs/bcachefs/backpointers.h | 2 +- fs/bcachefs/bcachefs.h | 10 ++----- fs/bcachefs/bkey.h | 2 +- fs/bcachefs/bkey_methods.c | 2 +- fs/bcachefs/bset.c | 12 ++++----- fs/bcachefs/bset.h | 2 +- fs/bcachefs/btree_cache.c | 6 ++--- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_io.c | 4 +-- fs/bcachefs/btree_iter.c | 8 +++--- fs/bcachefs/btree_trans_commit.c | 4 +-- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/debug.h | 2 +- fs/bcachefs/extents.c | 4 +-- fs/bcachefs/super.c | 42 ++++++++++++++++++++++++++--- 16 files changed, 69 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 631d4d24d78f..bdf524b465fa 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -182,7 +182,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - return (likely(!bch2_backpointers_no_use_write_buffer) + return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -192,7 +192,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { - return likely(!bch2_backpointers_no_use_write_buffer) + return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? 
bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) : 0; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index c72707ee9d42..f57098c32143 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -102,7 +102,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_i_backpointer *bp, bool insert) { - if (unlikely(bch2_backpointers_no_use_write_buffer)) + if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5392a0ec6439..46976409f733 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -439,16 +439,10 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() +#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; +BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM -#ifndef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM -#endif - #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 082632905649..b33356982460 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -399,7 +399,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, unpack_fn(dst, src); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - bch2_expensive_debug_checks) { + static_branch_unlikely(&bch2_expensive_debug_checks)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 00d05ccfaf73..fcd8c82cba4f 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -356,7 
+356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return ops->key_merge && bch2_bkey_maybe_mergable(l.k, r.k) && (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - !bch2_key_merging_disabled && + !static_branch_unlikely(&bch2_key_merging_disabled) && ops->key_merge(c, l, r); } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 9a4a83d6fd2d..7d2004a47fe6 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -501,7 +501,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!bch2_expensive_debug_checks) + if (!static_branch_unlikely(&bch2_expensive_debug_checks)) return; BUG_ON(bset_has_ro_aux_tree(t)); @@ -869,7 +869,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_expensive_debug_checks)) { BUG_ON(ret >= orig_k); for (i = ret @@ -1195,7 +1195,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_p_next(m); - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_expensive_debug_checks)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1435,7 +1435,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_expensive_debug_checks)) { bch2_btree_node_iter_verify(iter, b); bch2_btree_node_iter_next_check(iter, b); } @@ -1453,7 +1453,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree_node_iter_set *set; unsigned end = 0; - if (bch2_expensive_debug_checks) + if (static_branch_unlikely(&bch2_expensive_debug_checks)) bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { @@ -1489,7 +1489,7 @@ struct bkey_packed 
*bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (bch2_expensive_debug_checks) + if (static_branch_unlikely(&bch2_expensive_debug_checks)) bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 6953d55b72cc..f5b7fda537ea 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -537,7 +537,7 @@ static inline void bch2_verify_insert_pos(struct btree *b, static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (bch2_debug_check_btree_accounting) + if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) __bch2_verify_btree_nr_keys(b); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e48089252bb9..2fd58b08a54d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -377,7 +377,7 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, * - unless btree verify mode is enabled, since it runs out of * the post write cleanup: */ - if (bch2_verify_btree_ondisk) + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) bch2_btree_node_write(c, b, SIX_LOCK_intent, BTREE_WRITE_cache_reclaim); else @@ -473,7 +473,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long ret = SHRINK_STOP; bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return SHRINK_STOP; mutex_lock(&bc->lock); @@ -569,7 +569,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, { struct btree_cache_list *list = shrink->private_data; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return 0; return btree_cache_can_free(list); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dd08ec080313..91b6395421df 100644 --- a/fs/bcachefs/btree_gc.c 
+++ b/fs/bcachefs/btree_gc.c @@ -619,7 +619,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, deleted.p = k.k->p; if (initial) { - BUG_ON(bch2_journal_seq_verify && + BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && k.k->bversion.lo > atomic64_read(&c->journal.seq)); if (fsck_err_on(btree_id != BTREE_ID_accounting && diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e079e12adf86..d9adc4f5ba3d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1296,7 +1296,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || - (bch2_inject_invalid_keys && + (static_branch_unlikely(&bch2_inject_invalid_keys) && !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); @@ -1758,7 +1758,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, trace_and_count(c, btree_node_read, trans, b); - if (bch2_verify_all_btree_replicas && + if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && !btree_node_read_all_replicas(c, b, sync)) return; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bd3a0bc07511..be62958cdb9a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -147,7 +147,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, struct printbuf buf3 = PRINTBUF; const char *msg; - if (!bch2_debug_check_iterators) + if (!static_branch_unlikely(&bch2_debug_check_iterators)) return; l = &path->l[level]; @@ -281,7 +281,7 @@ static int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct bkey_s_c prev; int ret = 0; - if (!bch2_debug_check_iterators) + if (!static_branch_unlikely(&bch2_debug_check_iterators)) return 0; if (!(iter->flags & BTREE_ITER_filter_snapshots)) @@ -523,7 +523,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, __bch2_btree_node_iter_fix(path, b, 
node_iter, t, where, clobber_u64s, new_u64s); - if (bch2_debug_check_iterators) + if (static_branch_unlikely(&bch2_debug_check_iterators)) bch2_btree_node_iter_verify(node_iter, b); } @@ -2929,7 +2929,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; struct trans_for_each_path_inorder_iter iter; - if (!bch2_debug_check_iterators) + if (!static_branch_unlikely(&bch2_debug_check_iterators)) return; trans_for_each_path_inorder(trans, path, iter) { diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index cdde769e7da3..7e17df1df7f1 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -647,10 +647,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (bch2_journal_seq_verify) + if (static_branch_unlikely(&bch2_journal_seq_verify)) trans_for_each_update(trans, i) i->k->k.bversion.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) + else if (static_branch_unlikely(&bch2_inject_invalid_keys)) trans_for_each_update(trans, i) i->k->k.bversion = MAX_VERSION; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index ff9b95aac554..7fe793788a79 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -144,7 +144,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); - if (bch2_btree_node_merging_disabled) + if (static_branch_unlikely(&bch2_btree_node_merging_disabled)) return 0; b = path->l[level].b; diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 52dbea736709..d88b1194b8ac 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -14,7 +14,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, static inline void bch2_btree_verify(struct bch_fs 
*c, struct btree *b) { - if (bch2_verify_btree_ondisk) + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) __bch2_btree_verify(c, b); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d3af841e48ef..1ac9897f189d 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -164,7 +164,7 @@ static inline bool ptr_better(struct bch_fs *c, if (unlikely(failed_delta)) return failed_delta < 0; - if (unlikely(bch2_force_reconstruct_read)) + if (static_branch_unlikely(&bch2_force_reconstruct_read)) return p1.do_ec_reconstruct > p2.do_ec_reconstruct; if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) @@ -259,7 +259,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, p.do_ec_reconstruct = true; } - if (bch2_force_reconstruct_read && p.has_ec) + if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) p.do_ec_reconstruct = true; u64 p_latency = dev_latency(ca); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 95ed5ab541d3..8125c6804bd5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -2430,9 +2430,45 @@ static int __init bcachefs_init(void) return -ENOMEM; } -#define BCH_DEBUG_PARAM(name, description) \ - bool bch2_##name; \ - module_param_named(name, bch2_##name, bool, 0644); \ +#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); +BCH_DEBUG_PARAMS_ALL() +#undef BCH_DEBUG_PARAM + +static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) +{ + /* Match bool exactly, by re-using it. */ + struct static_key *key = kp->arg; + struct kernel_param boolkp = *kp; + bool v; + int ret; + + boolkp.arg = &v; + + ret = param_set_bool(val, &boolkp); + if (ret) + return ret; + if (v) + static_key_enable(key); + else + static_key_disable(key); + return 0; +} + +static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) +{ + struct static_key *key = kp->arg; + return sprintf(buffer, "%c\n", static_key_enabled(key) ? 
'Y' : 'N'); +} + +static const struct kernel_param_ops bch2_param_ops_static_key_t = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = bch2_param_set_static_key_t, + .get = bch2_param_get_static_key_t, +}; + +#define BCH_DEBUG_PARAM(name, description) \ + module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ + __MODULE_PARM_TYPE(name, "static_key_t"); \ MODULE_PARM_DESC(name, description); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM From 110bb6cb8b48bd6736d8e7c08a4236d443f8064a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 15:12:13 -0400 Subject: [PATCH 144/218] bcachefs: debug_check_btree_locking modparam Don't put btree locking asserts behind CONFIG_BCACHEFS_DEBUG, put them behind a module parameter. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/btree_locking.c | 10 +++------- fs/bcachefs/btree_locking.h | 21 ++++++++++++++------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 46976409f733..ae6b743ae014 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -393,6 +393,8 @@ do { \ "Disables rewriting of btree nodes during mark and sweep")\ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ "Disables the shrinker callback for the btree node cache")\ + BCH_DEBUG_PARAM(verify_btree_locking, \ + "Enable additional asserts for btree locking") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 448613be90ba..a45cfae8f671 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -839,9 +839,7 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, /* Debug */ -#ifdef CONFIG_BCACHEFS_DEBUG - -void bch2_btree_path_verify_locks(struct btree_path *path) +void __bch2_btree_path_verify_locks(struct btree_path *path) { /* * A path 
may be uptodate and yet have nothing locked if and only if @@ -882,7 +880,7 @@ static bool bch2_trans_locked(struct btree_trans *trans) return false; } -void bch2_trans_verify_locks(struct btree_trans *trans) +void __bch2_trans_verify_locks(struct btree_trans *trans) { if (!trans->locked) { BUG_ON(bch2_trans_locked(trans)); @@ -893,7 +891,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans) unsigned i; trans_for_each_path(trans, path, i) - bch2_btree_path_verify_locks(path); + __bch2_btree_path_verify_locks(path); } - -#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 59000d0dabea..1bb28e21d021 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -429,12 +429,19 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_path_verify_locks(struct btree_path *); -void bch2_trans_verify_locks(struct btree_trans *); -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} -#endif +void __bch2_btree_path_verify_locks(struct btree_path *); +void __bch2_trans_verify_locks(struct btree_trans *); + +static inline void bch2_btree_path_verify_locks(struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_btree_path_verify_locks(path); +} + +static inline void bch2_trans_verify_locks(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_trans_verify_locks(trans); +} #endif /* _BCACHEFS_BTREE_LOCKING_H */ From c4e38894407d27742c1fc8d0d64d8145c83077d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 15:25:56 -0400 Subject: [PATCH 145/218] bcachefs: debug_check_iterators no longer requires BCACHEFS_DEBUG Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 10 ++--- 
fs/bcachefs/btree_iter.c | 81 +++++++++++++++++++++++----------------- fs/bcachefs/btree_iter.h | 25 +++++++++---- 3 files changed, 69 insertions(+), 47 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ae6b743ae014..a3900fb08f92 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -393,8 +393,6 @@ do { \ "Disables rewriting of btree nodes during mark and sweep")\ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ "Disables the shrinker callback for the btree node cache")\ - BCH_DEBUG_PARAM(verify_btree_locking, \ - "Enable additional asserts for btree locking") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ @@ -404,15 +402,17 @@ do { \ "compare them") \ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ "Don't use the write buffer for backpointers, enabling "\ - "extra runtime checks") + "extra runtime checks") \ + BCH_DEBUG_PARAM(debug_check_btree_locking, \ + "Enable additional asserts for btree locking") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") /* Parameters that should only be compiled in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ BCH_DEBUG_PARAM(expensive_debug_checks, \ "Enables various runtime debugging checks that " \ "significantly affect performance") \ - BCH_DEBUG_PARAM(debug_check_iterators, \ - "Enables extra verification for btree iterators") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(journal_seq_verify, \ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index be62958cdb9a..55f4169ce0c9 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -114,11 +114,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree iterator: */ +/* Debug: */ -#ifdef CONFIG_BCACHEFS_DEBUG - -static void 
bch2_btree_path_verify_cached(struct btree_trans *trans, +static void __bch2_btree_path_verify_cached(struct btree_trans *trans, struct btree_path *path) { struct bkey_cached *ck; @@ -135,7 +133,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, btree_node_unlock(trans, path, 0); } -static void bch2_btree_path_verify_level(struct btree_trans *trans, +static void __bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree_path_level *l; @@ -147,16 +145,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, struct printbuf buf3 = PRINTBUF; const char *msg; - if (!static_branch_unlikely(&bch2_debug_check_iterators)) - return; - l = &path->l[level]; tmp = l->iter; locked = btree_node_locked(path, level); if (path->cached) { if (!level) - bch2_btree_path_verify_cached(trans, path); + __bch2_btree_path_verify_cached(trans, path); return; } @@ -217,7 +212,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, msg, level, buf1.buf, buf2.buf, buf3.buf); } -static void bch2_btree_path_verify(struct btree_trans *trans, +static void __bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; @@ -229,22 +224,22 @@ static void bch2_btree_path_verify(struct btree_trans *trans, break; } - bch2_btree_path_verify_level(trans, path, i); + __bch2_btree_path_verify_level(trans, path, i); } bch2_btree_path_verify_locks(path); } -void bch2_trans_verify_paths(struct btree_trans *trans) +void __bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; unsigned iter; trans_for_each_path(trans, path, iter) - bch2_btree_path_verify(trans, path); + __bch2_btree_path_verify(trans, path); } -static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) +static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) { BUG_ON(!!(iter->flags & BTREE_ITER_cached) != 
btree_iter_path(trans, iter)->cached); @@ -256,11 +251,11 @@ static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); - bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); + __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } -static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); @@ -274,16 +269,13 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_gt(iter->pos, iter->k.p))); } -static int bch2_btree_iter_verify_ret(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k) +static int __bch2_btree_iter_verify_ret(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k) { struct btree_iter copy; struct bkey_s_c prev; int ret = 0; - if (!static_branch_unlikely(&bch2_debug_check_iterators)) - return 0; - if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; @@ -324,7 +316,7 @@ static int bch2_btree_iter_verify_ret(struct btree_trans *trans, return ret; } -void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, +void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos) { bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -357,19 +349,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } -#else - static inline void bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned l) {} -static inline void bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) {} -static inline void 
bch2_btree_iter_verify(struct btree_trans *trans, - struct btree_iter *iter) {} -static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} -static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) { return 0; } + struct btree_path *path, unsigned l) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify_level(trans, path, l); +} -#endif +static inline void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify(trans, path); +} + +static inline void bch2_btree_iter_verify(struct btree_trans *trans, + struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify(trans, iter); +} + +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify_entry_exit(iter); +} + +static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + return static_branch_unlikely(&bch2_debug_check_iterators) + ? 
__bch2_btree_iter_verify_ret(trans, iter, k) + : 0; +} /* Btree path: fixups after btree updates */ diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 78a805a89860..cafd35a5e7a3 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -285,14 +285,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex : __bch2_trans_mutex_lock(trans, lock); } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); -#else -static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos) {} -#endif +/* Debug: */ + +void __bch2_trans_verify_paths(struct btree_trans *); +void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); + +static inline void bch2_trans_verify_paths(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_trans_verify_paths(trans); +} + +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, + struct bpos pos) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_assert_pos_locked(trans, btree, pos); +} void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *, struct bkey_packed *); From 34aeb820f900529bd40680399d379fa04a952850 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 15:49:38 -0400 Subject: [PATCH 146/218] bcachefs: debug_check_bset_lookups Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 13 +++++---- fs/bcachefs/bkey.h | 3 +- fs/bcachefs/bset.c | 64 +++++++++++++++++++++++------------------- fs/bcachefs/bset.h | 20 ++++--------- 4 files changed, 49 insertions(+), 51 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a3900fb08f92..076520beacd6 100644 --- 
a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -406,15 +406,16 @@ do { \ BCH_DEBUG_PARAM(debug_check_btree_locking, \ "Enable additional asserts for btree locking") \ BCH_DEBUG_PARAM(debug_check_iterators, \ - "Enables extra verification for btree iterators") + "Enables extra verification for btree iterators") \ + BCH_DEBUG_PARAM(debug_check_bset_lookups, \ + "Enables extra verification for bset lookups") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ + "Enables extra verification for bkey unpack") /* Parameters that should only be compiled in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(expensive_debug_checks, \ - "Enables various runtime debugging checks that " \ - "significantly affect performance") \ - BCH_DEBUG_PARAM(debug_check_btree_accounting, \ - "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(journal_seq_verify, \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index b33356982460..3ccd521c190a 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -398,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - static_branch_unlikely(&bch2_expensive_debug_checks)) { + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 7d2004a47fe6..32841f762eb2 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -144,8 +144,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) return nr; } -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *b) { struct btree_nr_keys nr 
= bch2_btree_node_count_keys(b); @@ -153,7 +151,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } -static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, +static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct btree *b) { struct btree_node_iter iter = *_iter; @@ -190,8 +188,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, } } -void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) +void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; @@ -237,8 +235,8 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, } } -void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - struct bkey_packed *insert, unsigned clobber_u64s) +static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); @@ -285,12 +283,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, #endif } -#else +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_verify_insert_pos(b, where, insert, clobber_u64s); +} -static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b) {} - -#endif /* Auxiliary search trees */ @@ -361,9 +362,8 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) +static void __bset_aux_tree_verify(struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) 
continue; @@ -375,7 +375,12 @@ static void bset_aux_tree_verify(struct btree *b) BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); } -#endif +} + +static inline void bset_aux_tree_verify(struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bset_aux_tree_verify(b); } void bch2_btree_keys_init(struct btree *b) @@ -495,15 +500,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, }; } -static void bch2_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) +static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!static_branch_unlikely(&bch2_expensive_debug_checks)) - return; - BUG_ON(bset_has_ro_aux_tree(t)); if (!bset_has_rw_aux_tree(t)) @@ -530,6 +531,13 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, } } +static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_bset_verify_rw_aux_tree(b, t); +} + /* returns idx of first entry >= offset: */ static unsigned rw_aux_tree_bsearch(struct btree *b, struct bset_tree *t, @@ -869,7 +877,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (static_branch_unlikely(&bch2_expensive_debug_checks)) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { BUG_ON(ret >= orig_k); for (i = ret @@ -1195,7 +1203,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_p_next(m); - if (static_branch_unlikely(&bch2_expensive_debug_checks)) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1435,9 +1443,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct 
btree_node_iter *iter, struct btree *b) { - if (static_branch_unlikely(&bch2_expensive_debug_checks)) { - bch2_btree_node_iter_verify(iter, b); - bch2_btree_node_iter_next_check(iter, b); + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { + __bch2_btree_node_iter_verify(iter, b); + __bch2_btree_node_iter_next_check(iter, b); } __bch2_btree_node_iter_advance(iter, b); @@ -1453,8 +1461,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree_node_iter_set *set; unsigned end = 0; - if (static_branch_unlikely(&bch2_expensive_debug_checks)) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { k = bch2_bkey_prev_all(b, t, @@ -1489,8 +1496,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (static_branch_unlikely(&bch2_expensive_debug_checks)) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index f5b7fda537ea..a15ecf9d006e 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -517,23 +517,15 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); void bch2_dump_btree_node(struct bch_fs *, struct btree *); void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *); -void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, - struct bkey_packed *, unsigned); +void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -#else - -static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) {} -static inline void 
bch2_verify_insert_pos(struct btree *b, - struct bkey_packed *where, - struct bkey_packed *insert, - unsigned clobber_u64s) {} -#endif + struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_btree_node_iter_verify(iter, b); +} static inline void bch2_verify_btree_nr_keys(struct btree *b) { From 5b1247ca5f286c50dfe58d461c91bc20fe80a749 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 15:53:10 -0400 Subject: [PATCH 147/218] bcachefs: debug_check_bkey_unpack Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 47 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 995ba32e9b6e..ee823c640642 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, } } -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) +static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) { struct bkey tmp; @@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, } } -#else static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) {} -#endif + const struct bkey *unpacked, + const struct bkey_format *format) +{ + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) + __bch2_bkey_pack_verify(packed, unpacked, format); +} struct pack_state { const struct bkey_format *format; @@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) return ret; } -#ifdef CONFIG_BCACHEFS_DEBUG static bool bkey_packed_successor(struct bkey_packed *out, const struct btree *b, struct bkey_packed k) @@ -455,7 +454,6 @@ static bool 
bkey_format_has_too_big_fields(const struct bkey_format *f) return false; } -#endif /* * Returns a packed key that compares <= in @@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, const struct bkey_format *f = &b->format; struct pack_state state = pack_state_init(f, out); u64 *w = out->_data; -#ifdef CONFIG_BCACHEFS_DEBUG struct bpos orig = in; -#endif bool exact = true; unsigned i; @@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, out->format = KEY_FORMAT_LOCAL_BTREE; out->type = KEY_TYPE_deleted; -#ifdef CONFIG_BCACHEFS_DEBUG - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0 && - !bkey_format_has_too_big_fields(f)); + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); + } } -#endif return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; } @@ -627,14 +623,13 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) } } -#ifdef CONFIG_BCACHEFS_DEBUG - { + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { struct printbuf buf = PRINTBUF; BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); printbuf_exit(&buf); } -#endif + return ret; } From 367cad09664aaf2d37e1b694eab6a14d0b5dedef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 17:16:11 -0400 Subject: [PATCH 148/218] bcachefs: Rename fsck_running, recovery_running flags Slightly more readable. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 ++-- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/error.c | 8 ++++---- fs/bcachefs/fsck.c | 4 ++-- fs/bcachefs/recovery.c | 10 +++++----- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 076520beacd6..27c025c05f8e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -677,8 +677,8 @@ struct bch_dev { x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ - x(recovery_running) \ - x(fsck_running) \ + x(in_recovery) \ + x(in_fsck) \ x(initial_gc_unfixed) \ x(need_delete_dead_snapshots) \ x(error) \ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d9adc4f5ba3d..a3250241e13e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -580,7 +580,7 @@ static int __btree_err(int ret, bool print_deferred = err_msg && rw == READ && - !(test_bit(BCH_FS_fsck_running, &c->flags) && + !(test_bit(BCH_FS_in_fsck, &c->flags) && c->opts.fix_errors == FSCK_FIX_ask); struct printbuf out = PRINTBUF; diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d7bc70fd7762..03567c559623 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -100,7 +100,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) prt_printf(out, "btree topology error: "); set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_recovery_running, &c->flags)) { + if (!test_bit(BCH_FS_in_recovery, &c->flags)) { __bch2_inconsistent_error(c, out); return -BCH_ERR_btree_need_topology_repair; } else { @@ -400,7 +400,7 @@ int bch2_fsck_err_opt(struct bch_fs *c, if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) flags |= fsck_flags_extra[err]; - if (test_bit(BCH_FS_fsck_running, &c->flags)) { + if (test_bit(BCH_FS_in_fsck, &c->flags)) { if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) return -BCH_ERR_fsck_repair_unimplemented; @@ -523,7 +523,7 @@ int __bch2_fsck_err(struct bch_fs *c, } goto print; - } else if 
(!test_bit(BCH_FS_fsck_running, &c->flags)) { + } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str_indented(out, ", shutting down\n" @@ -582,7 +582,7 @@ int __bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - if (test_bit(BCH_FS_fsck_running, &c->flags) && + if (test_bit(BCH_FS_in_fsck, &c->flags) && (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)) { exiting = true; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 7c10325a1fac..4258c91e6df3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3192,12 +3192,12 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fix_errors = FSCK_FIX_ask; c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); + set_bit(BCH_FS_in_fsck, &c->flags); c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; int ret = bch2_run_online_recovery_passes(c); - clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); bch_err_fn(c, ret); c->stdio = NULL; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 375111b56029..b4242ad4899d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -791,11 +791,11 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->opts.fsck) - set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) set_bit(BCH_FS_clean_recovery, &c->flags); - set_bit(BCH_FS_recovery_running, &c->flags); + if (c->opts.fsck) + set_bit(BCH_FS_in_fsck, &c->flags); + set_bit(BCH_FS_in_recovery, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -977,8 +977,8 @@ int bch2_fs_recovery(struct bch_fs *c) * multithreaded use: */ set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_fsck_running, &c->flags); - clear_bit(BCH_FS_recovery_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); + 
clear_bit(BCH_FS_in_recovery, &c->flags); /* in case we don't run journal replay, i.e. norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); From 177ac4925f4cbcfb672cb46f1d443a3d6c8a4b11 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 17:19:05 -0400 Subject: [PATCH 149/218] bcachefs: Don't rewind recovery if not in recovery Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 5f2bd8c10ce6..22cefffcf1fa 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -233,7 +233,8 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, c->opts.recovery_passes |= BIT_ULL(pass); - if (c->curr_recovery_pass > pass) { + if (test_bit(BCH_FS_in_recovery, &c->flags) && + c->curr_recovery_pass > pass) { c->next_recovery_pass = pass; c->recovery_passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; From 7ad7497862a2484333fa6a054aeab11a9b2f979b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 May 2025 14:54:07 -0400 Subject: [PATCH 150/218] bcachefs: add missing locking in bch2_write_point_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 828cf94217dd..6aefa490ec24 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1517,6 +1517,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob; unsigned i; + mutex_lock(&wp->lock); + prt_printf(out, "%lu: ", wp->write_point); prt_human_readable_u64(out, wp->sectors_allocated << 9); @@ -1534,6 +1536,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, open_bucket_for_each(c, &wp->ptrs, ob, i) bch2_open_bucket_to_text(out, c, ob); 
printbuf_indent_sub(out, 2); + + mutex_unlock(&wp->lock); } void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) From ac4c7ac90eb7f5ea013b8842b2d803742f4484c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 May 2025 15:14:19 -0400 Subject: [PATCH 151/218] bcachefs: Extra write buffer asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 2 ++ fs/bcachefs/btree_write_buffer.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 62d24b081e27..3a246610b673 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -143,6 +143,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr { kmsan_check_memory(k, bkey_bytes(&k->k)); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 0094e4342b69..efb0c64d0aac 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -182,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite return wb_flush_one_slowpath(trans, iter, wb); } + EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; return 0; From b42fac043f95512911f3e496585a0844747dc593 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 10:53:23 -0400 Subject: [PATCH 152/218] bcachefs: bch2_fs_emergency_read_only2() More error message cleanup: instead of multiple printk()s per error, we want to build up a single error message in a printbuf, so that it can be printed with indenting that shows grouping, and so that errors don't get interspersed or lost in the log. This gets rid of most calls to bch2_fs_emergency_read_only().
We still have calls to - bch2_fatal_error() - bch2_fs_fatal_error() - bch2_fs_fatal_err_on() that need work. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 16 ++++++--- fs/bcachefs/btree_iter.c | 47 +++++++++++++++++-------- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/data_update.c | 4 ++- fs/bcachefs/error.c | 17 ++++----- fs/bcachefs/error.h | 7 +++- fs/bcachefs/fs-ioctl.c | 14 +++++--- fs/bcachefs/journal_io.c | 18 +++++++--- fs/bcachefs/recovery.c | 13 +++++-- fs/bcachefs/super.c | 54 ++++++++++++++++++++++++----- fs/bcachefs/super.h | 2 ++ 11 files changed, 145 insertions(+), 49 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a3250241e13e..97cd25cd492b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1766,23 +1766,31 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, NULL, &pick, -1); if (ret <= 0) { + bool ratelimit = true; struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); prt_newline(&buf); bch2_btree_lost_data(c, &buf, b->c.btree_id); - bch_err_ratelimited(c, "%s", buf.buf); if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) - bch2_fatal_error(c); + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology && + bch2_fs_emergency_read_only2(c, &buf)) + ratelimit = false; + + static DEFINE_RATELIMIT_STATE(rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (!ratelimit || __ratelimit(&rs)) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); set_btree_node_read_error(b); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - printbuf_exit(&buf); return; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 55f4169ce0c9..e0c1e873c886 100644 --- 
a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "replicas.h" #include "snapshot.h" +#include "super.h" #include "trace.h" #include @@ -3449,29 +3450,45 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) return trans; } -static void check_btree_paths_leaked(struct btree_trans *trans) -{ #ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; + +static bool btree_paths_leaked(struct btree_trans *trans) +{ struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) if (path->ref) - goto leaked; - return; -leaked: - bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path, i) - if (path->ref) - printk(KERN_ERR " btree %s %pS\n", - bch2_btree_id_str(path->btree_id), - (void *) path->ip_allocated); - /* Be noisy about this: */ - bch2_fatal_error(c); -#endif + return true; + return false; } +static void check_btree_paths_leaked(struct btree_trans *trans) +{ + if (btree_paths_leaked(trans)) { + struct bch_fs *c = trans->c; + struct btree_path *path; + unsigned i; + + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); + trans_for_each_path(trans, path, i) + if (path->ref) + prt_printf(&buf, "btree %s %pS\n", + bch2_btree_id_str(path->btree_id), + (void *) path->ip_allocated); + + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } +} +#else +static inline void check_btree_paths_leaked(struct btree_trans *trans) {} +#endif + void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 3d25c2be035e..2d43d51b597d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1813,10 +1813,10 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans 
*t __func__, b->c.level); bch2_btree_update_to_text(&buf, as); bch2_btree_path_to_text(&buf, trans, path_idx); + bch2_fs_emergency_read_only2(c, &buf); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - bch2_fs_emergency_read_only(c); return -EIO; } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 9b44f11fb0d9..e5909e54ad2e 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -348,6 +348,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, }); if (invalid) { struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); prt_str(&buf, "about to insert invalid key in data update path"); prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); @@ -358,10 +359,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, prt_str(&buf, "\nnew: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - bch2_fatal_error(c); ret = -BCH_ERR_invalid_bkey; goto out; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 03567c559623..52f1108d5829 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -11,12 +11,12 @@ #define FSCK_ERR_RATELIMIT_NR 10 -void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) { printbuf_indent_add_nextline(out, 2); #ifdef BCACHEFS_LOG_PREFIX - prt_printf(out, bch2_log_msg(c, "")); + prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); #endif } @@ -29,9 +29,7 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) return false; case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: - if (bch2_fs_emergency_read_only(c)) - prt_printf(out, "inconsistency detected - emergency read only at journal seq %llu\n", - journal_cur_seq(&c->journal)); + bch2_fs_emergency_read_only2(c, out); return true; case BCH_ON_ERROR_panic: bch2_print_str(c, KERN_ERR, out->buf); 
@@ -151,14 +149,17 @@ void bch2_io_error_work(struct work_struct *work) bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, BCH_FORCE_IF_DEGRADED); + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); - bch_err(ca, - "writes erroring for %u seconds, setting %s ro", + prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", c->opts.write_error_timeout, dev ? "device" : "filesystem"); if (!dev) - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); } out: up_write(&c->state_lock); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index d89dd270b2e5..5123d4c86770 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -18,7 +18,12 @@ struct work_struct; /* Error messages: */ -void bch2_log_msg_start(struct bch_fs *, struct printbuf *); +void __bch2_log_msg_start(const char *, struct printbuf *); + +static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +{ + __bch2_log_msg_start(c->name, out); +} /* * Inconsistency errors: The on disk data is inconsistent. 
If these occur during diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index a82dfce9e4ad..05361a793206 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -172,7 +172,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) if (get_user(flags, arg)) return -EFAULT; - bch_notice(c, "shutdown by ioctl type %u", flags); + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "shutdown by ioctl type %u", flags); switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: @@ -180,20 +183,23 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) if (ret) break; bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); bdev_thaw(c->vfs_sb->s_bdev); break; case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); fallthrough; case FSOP_GOING_FLAGS_NOLOGFLUSH: - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); break; default: ret = -EINVAL; - break; + goto noprint; } + bch2_print_str(c, KERN_ERR, buf.buf); +noprint: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index c593d77dc8f2..06f7b018492c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1628,8 +1628,6 @@ static CLOSURE_CALLBACK(journal_write_done) : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { - if (!bch2_journal_error(j)) - bch_err(c, "unable to write journal to sufficient devices"); err = -BCH_ERR_journal_write_err; } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, @@ -1637,8 +1635,20 @@ static CLOSURE_CALLBACK(journal_write_done) err = bch2_mark_replicas(c, &replicas.e); } - if (err) - bch2_fatal_error(c); + if (err && !bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + if (err == -BCH_ERR_journal_write_err) + prt_printf(&buf, "unable to write journal to sufficient devices"); + else + prt_printf(&buf, 
"journal write error marking replicas: %s", bch2_err_str(err)); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } closure_debug_destroy(cl); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b4242ad4899d..1895a6b13001 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1108,8 +1108,17 @@ int bch2_fs_recovery(struct bch_fs *c) return ret; err: fsck_err: - bch2_fs_emergency_read_only(c); - goto out; + { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret)); + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } + return ret; } int bch2_fs_initialize(struct bch_fs *c) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8125c6804bd5..c46b2b2ebab1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -438,6 +438,30 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } +static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, + bool locked) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); + + if (!locked) + bch2_journal_halt(&c->journal); + else + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); + wake_up(&bch2_read_only_wait); + + if (ret) + prt_printf(out, "emergency read only at seq %llu\n", + journal_cur_seq(&c->journal)); + + return ret; +} + +bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) +{ + return __bch2_fs_emergency_read_only2(c, out, false); +} + bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); @@ -2252,20 +2276,32 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) if (!ca) goto unlock; - if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { + bool dev = bch2_dev_state_allowed(c, ca, + 
BCH_MEMBER_STATE_failed, + BCH_FORCE_IF_DEGRADED); + + if (!dev && sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "offline from block layer"); + + if (dev) { __bch2_dev_offline(c, ca); } else { - if (sb) { - if (!surprise) - sync_filesystem(sb); - shrink_dcache_sb(sb); - evict_inodes(sb); - } - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); } + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + bch2_dev_put(ca); unlock: if (sb) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index be75603fefe9..dc52f06cb2b9 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -32,6 +32,8 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); + bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); From 49188a9313e2209d064082378eafe5baf5a27bba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 May 2025 17:01:49 -0400 Subject: [PATCH 153/218] bcachefs: kill move_bucket_in_flight Small cleanup/simplification, and prep work for the next patch, which will add checking if buckets don't get evacuated because they're missing backpointers. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 12 +++--- fs/bcachefs/move.h | 4 +- fs/bcachefs/move_types.h | 8 +--- fs/bcachefs/movinggc.c | 82 +++++++++++++++++++--------------------- 4 files changed, 49 insertions(+), 57 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 42076aa3438b..3a92eced2e67 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -67,7 +67,7 @@ static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) struct moving_io { struct list_head read_list; struct list_head io_list; - struct move_bucket_in_flight *b; + struct move_bucket *b; struct closure cl; bool read_completed; @@ -289,7 +289,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) } int bch2_move_extent(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, + struct move_bucket *bucket_in_flight, struct btree_iter *iter, struct bkey_s_c k, struct bch_io_opts io_opts, @@ -810,7 +810,7 @@ int bch2_move_data(struct bch_fs *c, } static int __bch2_move_data_phys(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, + struct move_bucket *bucket_in_flight, unsigned dev, u64 bucket_start, u64 bucket_end, @@ -1008,9 +1008,9 @@ static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, } int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts data_opts) + struct move_bucket *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts data_opts) { struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 0c620a5f728d..fb38383ffc7b 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -116,7 +116,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); int bch2_move_extent(struct moving_context 
*, - struct move_bucket_in_flight *, + struct move_bucket *, struct btree_iter *, struct bkey_s_c, struct bch_io_opts, @@ -143,7 +143,7 @@ int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, move_pred_fn, void *); int bch2_evacuate_bucket(struct moving_context *, - struct move_bucket_in_flight *, + struct move_bucket *, struct bpos, int, struct data_update_opts); int bch2_data_job(struct bch_fs *, diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 807f779f6f76..c5c62cd600de 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -36,14 +36,10 @@ struct move_bucket_key { }; struct move_bucket { + struct move_bucket *next; + struct rhash_head hash; struct move_bucket_key k; unsigned sectors; -}; - -struct move_bucket_in_flight { - struct move_bucket_in_flight *next; - struct rhash_head hash; - struct move_bucket bucket; atomic_t count; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 66f4920552c5..dd07816401be 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -27,47 +27,36 @@ #include struct buckets_in_flight { - struct rhashtable table; - struct move_bucket_in_flight *first; - struct move_bucket_in_flight *last; - size_t nr; - size_t sectors; + struct rhashtable table; + struct move_bucket *first; + struct move_bucket *last; + size_t nr; + size_t sectors; }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .head_offset = offsetof(struct move_bucket, hash), + .key_offset = offsetof(struct move_bucket, k), .key_len = sizeof(struct move_bucket_key), .automatic_shrinking = true, }; -static struct move_bucket_in_flight * -move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +static int move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) { - struct move_bucket_in_flight *new = 
kzalloc(sizeof(*new), GFP_KERNEL); - int ret; - - if (!new) - return ERR_PTR(-ENOMEM); - - new->bucket = b; - - ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, - bch_move_bucket_params); - if (ret) { - kfree(new); - return ERR_PTR(ret); - } + int ret = rhashtable_lookup_insert_fast(&list->table, &b->hash, + bch_move_bucket_params); + if (ret) + return ret; if (!list->first) - list->first = new; + list->first = b; else - list->last->next = new; + list->last->next = b; - list->last = new; + list->last = b; list->nr++; - list->sectors += b.sectors; - return new; + list->sectors += b->sectors; + return 0; } static int bch2_bucket_is_movable(struct btree_trans *trans, @@ -111,7 +100,7 @@ static void move_buckets_wait(struct moving_context *ctxt, struct buckets_in_flight *list, bool flush) { - struct move_bucket_in_flight *i; + struct move_bucket *i; int ret; while ((i = list->first)) { @@ -126,7 +115,7 @@ static void move_buckets_wait(struct moving_context *ctxt, list->last = NULL; list->nr--; - list->sectors -= i->bucket.sectors; + list->sectors -= i->sectors; ret = rhashtable_remove_fast(&list->table, &i->hash, bch_move_bucket_params); @@ -143,7 +132,7 @@ static bool bucket_in_flight(struct buckets_in_flight *list, return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); } -typedef DARRAY(struct move_bucket) move_buckets; +typedef DARRAY(struct move_bucket *) move_buckets; static int bch2_copygc_get_buckets(struct moving_context *ctxt, struct buckets_in_flight *buckets_in_flight, @@ -184,9 +173,18 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret2 = darray_push(buckets, b); + struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); + ret2 = b_i ? 
0 : -ENOMEM; if (ret2) goto err; + + *b_i = b; + + ret2 = darray_push(buckets, b_i); + if (ret2) { + kfree(b_i); + goto err; + } sectors += b.sectors; } @@ -213,7 +211,6 @@ static int bch2_copygc(struct moving_context *ctxt, .btree_insert_flags = BCH_WATERMARK_copygc, }; move_buckets buckets = { 0 }; - struct move_bucket_in_flight *f; u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; @@ -226,26 +223,23 @@ static int bch2_copygc(struct moving_context *ctxt, if (kthread_should_stop() || freezing(current)) break; - f = move_bucket_in_flight_add(buckets_in_flight, *i); - ret = PTR_ERR_OR_ZERO(f); - if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ + struct move_bucket *b = *i; + *i = NULL; + + ret = move_bucket_in_flight_add(buckets_in_flight, b); + if (ret) { /* rare race: copygc_get_buckets returned same bucket more than once */ + kfree(b); ret = 0; continue; } - if (ret == -ENOMEM) { /* flush IO, continue later */ - ret = 0; - break; - } - ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, - f->bucket.k.gen, data_opts); + ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); if (ret) goto err; *did_work = true; } err: - /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) ret = 0; @@ -257,6 +251,8 @@ static int bch2_copygc(struct moving_context *ctxt, sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + darray_for_each(buckets, i) + kfree(*i); darray_exit(&buckets); return ret; } From fb7e78cc251bb931dea2b41bffbea344bdac5ddb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 May 2025 17:17:17 -0400 Subject: [PATCH 154/218] bcachefs: Move pending buckets queue to buckets_in_flight Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 24 +++++++++++------------- 1 file 
changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index dd07816401be..83bd70b9a639 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -32,6 +32,8 @@ struct buckets_in_flight { struct move_bucket *last; size_t nr; size_t sectors; + + DARRAY(struct move_bucket *) to_evacuate; }; static const struct rhashtable_params bch_move_bucket_params = { @@ -132,11 +134,8 @@ static bool bucket_in_flight(struct buckets_in_flight *list, return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); } -typedef DARRAY(struct move_bucket *) move_buckets; - static int bch2_copygc_get_buckets(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight, - move_buckets *buckets) + struct buckets_in_flight *buckets_in_flight) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -180,7 +179,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, *b_i = b; - ret2 = darray_push(buckets, b_i); + ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); if (ret2) { kfree(b_i); goto err; @@ -188,14 +187,14 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, sectors += b.sectors; } - ret2 = buckets->nr >= nr_to_get; + ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; err: ret2; })); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); + saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); return ret < 0 ? 
ret : 0; } @@ -210,16 +209,15 @@ static int bch2_copygc(struct moving_context *ctxt, struct data_update_opts data_opts = { .btree_insert_flags = BCH_WATERMARK_copygc, }; - move_buckets buckets = { 0 }; u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); if (ret) goto err; - darray_for_each(buckets, i) { + darray_for_each(buckets_in_flight->to_evacuate, i) { if (kthread_should_stop() || freezing(current)) break; @@ -249,11 +247,11 @@ static int bch2_copygc(struct moving_context *ctxt, sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; - trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - darray_for_each(buckets, i) + darray_for_each(buckets_in_flight->to_evacuate, i) kfree(*i); - darray_exit(&buckets); + darray_exit(&buckets_in_flight->to_evacuate); return ret; } From e4e513f2d51d3852a7af90d50c890815a8af70b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 May 2025 17:19:10 -0400 Subject: [PATCH 155/218] bcachefs: move_buckets in rhashtable when allocated Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 83bd70b9a639..cc843815f7eb 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -43,13 +43,8 @@ static const struct rhashtable_params bch_move_bucket_params = { .automatic_shrinking = true, }; -static int move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) +static void move_bucket_in_flight_add(struct buckets_in_flight 
*list, struct move_bucket *b) { - int ret = rhashtable_lookup_insert_fast(&list->table, &b->hash, - bch_move_bucket_params); - if (ret) - return ret; - if (!list->first) list->first = b; else @@ -58,7 +53,6 @@ static int move_bucket_in_flight_add(struct buckets_in_flight *list, struct move list->last = b; list->nr++; list->sectors += b->sectors; - return 0; } static int bch2_bucket_is_movable(struct btree_trans *trans, @@ -98,12 +92,20 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; } +static void move_bucket_free(struct buckets_in_flight *list, + struct move_bucket *b) +{ + int ret = rhashtable_remove_fast(&list->table, &b->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(b); +} + static void move_buckets_wait(struct moving_context *ctxt, struct buckets_in_flight *list, bool flush) { struct move_bucket *i; - int ret; while ((i = list->first)) { if (flush) @@ -119,10 +121,7 @@ static void move_buckets_wait(struct moving_context *ctxt, list->nr--; list->sectors -= i->sectors; - ret = rhashtable_remove_fast(&list->table, &i->hash, - bch_move_bucket_params); - BUG_ON(ret); - kfree(i); + move_bucket_free(list, i); } bch2_trans_unlock_long(ctxt->trans); @@ -184,6 +183,11 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, kfree(b_i); goto err; } + + ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash, + bch_move_bucket_params); + BUG_ON(ret2); + sectors += b.sectors; } @@ -224,12 +228,7 @@ static int bch2_copygc(struct moving_context *ctxt, struct move_bucket *b = *i; *i = NULL; - ret = move_bucket_in_flight_add(buckets_in_flight, b); - if (ret) { /* rare race: copygc_get_buckets returned same bucket more than once */ - kfree(b); - ret = 0; - continue; - } + move_bucket_in_flight_add(buckets_in_flight, b); ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); if (ret) @@ -250,7 +249,8 @@ static int bch2_copygc(struct moving_context *ctxt, trace_and_count(c, copygc, c, 
buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); darray_for_each(buckets_in_flight->to_evacuate, i) - kfree(*i); + if (*i) + move_bucket_free(buckets_in_flight, *i); darray_exit(&buckets_in_flight->to_evacuate); return ret; } From c7378d0e5e23db8c7da8173d7961620078071796 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 13:49:51 -0400 Subject: [PATCH 156/218] bcachefs: Add tracepoint, counter for io_move_created_rebalance Internal moves shouldn't add new rebalance_work, but it's been reported that this seems to be happening. Add a tracepoint and counter so we can see what's going on. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 61 ++++++++++++++++++++++---------- fs/bcachefs/sb-counters_format.h | 1 + fs/bcachefs/trace.h | 5 +++ 3 files changed, 48 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index e5909e54ad2e..c39ea51e9e48 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -346,7 +346,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, .btree = m->btree_id, .flags = BCH_VALIDATE_commit, }); - if (invalid) { + if (unlikely(invalid)) { struct printbuf buf = PRINTBUF; bch2_log_msg_start(c, &buf); @@ -368,6 +368,21 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, goto out; } + printbuf_reset(&journal_msg); + prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); + + ret = bch2_trans_log_msg(trans, &journal_msg) ?: + bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, bkey_start_pos(&insert->k)) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, insert->k.p) ?: + bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: + bch2_trans_update(trans, &iter, insert, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto err; + if (trace_data_update_enabled()) { struct printbuf buf = PRINTBUF; @@ -382,30 +397,38 @@ 
static int __bch2_data_update_index_update(struct btree_trans *trans, printbuf_exit(&buf); } - printbuf_reset(&journal_msg); - prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); + if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > + bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) { + struct printbuf buf = PRINTBUF; - ret = bch2_trans_log_msg(trans, &journal_msg) ?: - bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, bkey_start_pos(&insert->k)) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: - bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, &op->res, + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_io_move_created_rebalance(c, buf.buf); + printbuf_exit(&buf); + + this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); + } + + ret = bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); - if (!ret) { - bch2_btree_iter_set_pos(trans, &iter, next_pos); + if (ret) + goto err; - this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); - if (trace_io_move_finish_enabled()) - trace_io_move_finish2(m, &new->k_i, insert); - } + bch2_btree_iter_set_pos(trans, &iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); + if (trace_io_move_finish_enabled()) + trace_io_move_finish2(m, &new->k_i, insert); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; diff --git a/fs/bcachefs/sb-counters_format.h 
b/fs/bcachefs/sb-counters_format.h index 5c4e5de79d81..7c0c9c842b4e 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -25,6 +25,7 @@ enum counters_flags { x(io_move_fail, 38, TYPE_COUNTER) \ x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ + x(io_move_created_rebalance, 83, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ x(bucket_discard_fast, 79, TYPE_COUNTER) \ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 8c07189a080a..a31024f082f3 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1473,6 +1473,11 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, io_move_created_rebalance, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + TRACE_EVENT(error_downcast, TP_PROTO(int bch_err, int std_err, unsigned long ip), TP_ARGS(bch_err, std_err, ip), From 648c1142c9f1ad914c9fd79cedbd6b92ac788cd6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 May 2025 10:44:21 -0400 Subject: [PATCH 157/218] bcachefs: fix can_write_extent() Failing to check the return value of bch2_dev_rcu(): we could (technically) race with device removal. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c39ea51e9e48..de096ca65b4b 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -744,7 +744,9 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) rcu_read_lock(); unsigned nr_replicas = 0, i; for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu(c, i); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); + if (!ca) + continue; struct bch_dev_usage usage; bch2_dev_usage_read_fast(ca, &usage); From e882906929c55a9561641944d24c11dc3f338225 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 May 2025 17:58:00 -0400 Subject: [PATCH 158/218] bcachefs: Fix opt hooks in sysfs for non sb option We weren't checking if the option changed for non-superblock options - this led to rebalance not waking up when enabling the "rebalance_enabled" option. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4c7d609d79fd..de7cda282a8c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -642,7 +642,18 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, if (ret < 0) goto err; - bool changed = bch2_opt_set_sb(c, ca, opt, v); + bool is_sb = opt->get_sb || opt->get_member; + bool changed = false; + + if (is_sb) { + changed = bch2_opt_set_sb(c, ca, opt, v); + } else if (!ca) { + changed = bch2_opt_get_by_id(&c->opts, id) != v; + } else { + /* device options that aren't superblock options aren't + * supported */ + BUG(); + } if (!ca) bch2_opt_set_by_id(&c->opts, id, v); From 688321f97e0820024c259a066cd197e9e69cb8c8 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 15 May 2025 22:29:50 +0800 Subject: [PATCH 159/218] bcachefs: Kill BTREE_TRIGGER_bucket_invalidate Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 9 --------- fs/bcachefs/btree_types.h | 6 +----- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 81e2ae4bb400..51dfd2f24c20 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -914,15 +914,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; } - if ((flags & BTREE_TRIGGER_bucket_invalidate) && - old_a->cached_sectors) { - ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, - -((s64) old_a->cached_sectors), - flags & BTREE_TRIGGER_gc); - if (ret) - goto err; - } - ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); if (ret) goto err; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 3acccca3b3a3..e5a965db68b4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -258,9 +258,6 @@ struct btree_node_iter { * * BTREE_TRIGGER_insert - @new is entering 
the btree * BTREE_TRIGGER_overwrite - @old is leaving the btree - * - * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc - * trigger */ #define BTREE_TRIGGER_FLAGS() \ x(norun) \ @@ -270,8 +267,7 @@ struct btree_node_iter { x(gc) \ x(insert) \ x(overwrite) \ - x(is_root) \ - x(bucket_invalidate) + x(is_root) enum { #define x(n) BTREE_ITER_FLAG_BIT_##n, From 4a67b94bd816b56768fe06d880f02ae0bf6ceade Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 13 May 2025 02:44:26 +0800 Subject: [PATCH 160/218] bcachefs: Early return to avoid unnecessary lock Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 51dfd2f24c20..daf23d471d4f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2576,19 +2576,18 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; - bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - ob->dev == ca->dev_idx) - ret = true; - spin_unlock(&ob->lock); + scoped_guard(spinlock, &ob->lock) { + if (ob->valid && !ob->on_partial_list && + ob->dev == ca->dev_idx) + return true; + } } - return ret; + return false; } void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) From 123d2d09ff599ec11a01687f472c7bdc3b3b6f12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 08:31:02 -0400 Subject: [PATCH 161/218] bcachefs: bch2_inode_find_snapshot_root() Factor out a small common helper. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 28 +--------------------------- fs/bcachefs/inode.c | 24 ++++++++++++++++++++++++ fs/bcachefs/inode.h | 3 +++ fs/bcachefs/str_hash.c | 30 ++++++------------------------ 4 files changed, 34 insertions(+), 51 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 4258c91e6df3..e7cac5a69154 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1115,32 +1115,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans, return ret; } -static int get_snapshot_root_inode(struct btree_trans *trans, - struct bch_inode_unpacked *root, - u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found_root; - } - if (ret) - goto err; - BUG(); -found_root: - ret = bch2_inode_unpack(k, root); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1171,7 +1145,7 @@ static int check_inode(struct btree_trans *trans, goto err; if (snapshot_root->bi_inum != u.bi_inum) { - ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); + ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); if (ret) goto err; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 96d4ab0148bf..a17a952ea161 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1131,6 +1131,30 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); } +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + 
SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) { + ret = bch2_inode_unpack(k, root); + goto out; + } + } + /* We're only called when we know we have an inode for @inum */ + BUG_ON(!ret); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { if (bi->bi_flags & BCH_INODE_unlinked) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5cfba9e98966..bb81b7c269bb 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -173,6 +173,9 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, struct bch_inode_unpacked *); +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root); + #define inode_opt_get(_c, _inode, _name) \ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 55a3a116b5a8..2d6379473ad4 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -146,34 +146,17 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans struct bch_hash_info *hash_info) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found; - } - - /* This would've been caught by check_key_has_inode() */ - bch_err(c, "%s(): inum %llu not found", __func__, inum); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; -found:; - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); + struct bch_inode_unpacked snapshot_root; + int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); if (ret) - goto err; + return ret; - struct 
bch_hash_info hash_root = bch2_hash_info_init(c, &inode); + struct bch_hash_info hash_root = bch2_hash_info_init(c, &snapshot_root); if (hash_info->type != hash_root.type || memcmp(&hash_info->siphash_key, &hash_root.siphash_key, sizeof(hash_root.siphash_key))) { - ret = repair_inode_hash_info(trans, &inode); + ret = repair_inode_hash_info(trans, &snapshot_root); if (!ret) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", inum); @@ -190,8 +173,7 @@ found:; ret = -BCH_ERR_fsck_repair_unimplemented; } } -err: - bch2_trans_iter_exit(trans, &iter); + return ret; } From fdd0807f812204c36bfe71710c8a796731ee3777 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 08:41:26 -0400 Subject: [PATCH 162/218] bcachefs: Improve bch2_repair_inode_hash_info() Improve this so it can be used by fsck.c check_inode(); it provides a much better error message than the check_inode() version. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 13 +++-- fs/bcachefs/str_hash.c | 112 ++++++++++++++++++++++++++++------------- fs/bcachefs/str_hash.h | 2 + 3 files changed, 86 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e7cac5a69154..2b7bc67dcdf8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1150,13 +1150,12 @@ static int check_inode(struct btree_trans *trans, goto err; } - if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { - u.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); - do_update = true; + if (u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { + ret = bch2_repair_inode_hash_info(trans, snapshot_root); + BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); + if (ret) + goto 
err; } if (u.bi_dir || u.bi_dir_offset) { diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 2d6379473ad4..0cbf5508a32c 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -101,17 +101,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans, } } -static int repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) +/* + * str_hash lookups across snapshots break in wild ways if hash_info in + * different snapshot versions doesn't match - so if we find one mismatch, check + * them all + */ +int bch2_repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + bool need_commit = false; int ret = 0; - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != snapshot_root->bi_inum) + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, + POS(0, snapshot_root->bi_inum), + BTREE_ITER_all_snapshots, k, ret) { + if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) break; if (!bkey_is_inode(k.k)) continue; @@ -121,19 +129,72 @@ static int repair_inode_hash_info(struct btree_trans *trans, if (ret) break; - if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { + if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && + INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + + BUG_ON(hash1.type != hash2.type || + memcmp(&hash1.siphash_key, + 
&hash2.siphash_key, + sizeof(hash1.siphash_key))); +#endif + continue; + } + + printbuf_reset(&buf); + prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", + snapshot_root->bi_inum, + inode.bi_snapshot, + snapshot_root->bi_snapshot); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); + prt_printf(&buf, " %llx\n", inode.bi_hash_seed); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); + + if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { inode.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - break; + + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + break; + need_commit = true; } } + + if (ret) + goto err; + + if (!need_commit) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", + snapshot_root->bi_inum); + + prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); +#if 0 + prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_info->type); + prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); +#endif + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +err: fsck_err: + printbuf_exit(&buf); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -145,34 +206,17 @@ static int repair_inode_hash_info(struct btree_trans *trans, static noinline 
int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, struct bch_hash_info *hash_info) { - struct bch_fs *c = trans->c; struct bch_inode_unpacked snapshot_root; int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); if (ret) return ret; - struct bch_hash_info hash_root = bch2_hash_info_init(c, &snapshot_root); + struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); if (hash_info->type != hash_root.type || memcmp(&hash_info->siphash_key, &hash_root.siphash_key, - sizeof(hash_root.siphash_key))) { - ret = repair_inode_hash_info(trans, &snapshot_root); - if (!ret) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", inum); - - prt_printf(&buf, "root snapshot %u ", hash_root.inum_snapshot); - bch2_prt_str_hash_type(&buf, hash_root.type); - prt_printf(&buf, " %llx %llx\n", hash_root.siphash_key.k0, hash_root.siphash_key.k1); - - prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); - bch2_prt_str_hash_type(&buf, hash_info->type); - prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_fsck_repair_unimplemented; - } - } + sizeof(hash_root.siphash_key))) + ret = bch2_repair_inode_hash_info(trans, &snapshot_root); return ret; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ae3154fb6a94..6762b3627e1b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -394,6 +394,8 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); + struct snapshots_seen; int __bch2_str_hash_check_key(struct btree_trans *, struct snapshots_seen *, From bde41d9a58f159da8330b499cced04c40104f7f3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 09:15:24 -0400 Subject: [PATCH 163/218] bcachefs: better error message 
for subvol_fs_path_parent_wrong Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2b7bc67dcdf8..ab936520e0ae 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2079,7 +2079,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * 0, subvolume); ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; if (ret) { if (fsck_err(trans, dirent_to_missing_subvol, @@ -2090,18 +2090,28 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto out; } - if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, - trans, subvol_fs_path_parent_wrong, - "subvol with wrong fs_path_parent, should be be %u\n%s", - parent_subvol, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); + if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { + printbuf_reset(&buf); + + prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", + parent_subvol); + + ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, + le64_to_cpu(s.v->inode) }, &buf); if (ret) goto err; + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, s.s_c); - n->v.fs_path_parent = cpu_to_le32(parent_subvol); + if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } } u64 target_inum = le64_to_cpu(s.v->inode); From 84b9f17195b2d4914763feee00ca44be42c9f8e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 10:08:06 -0400 Subject: [PATCH 164/218] bcachefs: do_rebalance_scan() now only 
updates bch_extent_rebalance This ensures that our pending rebalance work accounting is accurate quickly. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 2 +- fs/bcachefs/move.h | 4 ++++ fs/bcachefs/rebalance.c | 42 ++++++++++++++++++++++++++--------------- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3a92eced2e67..49898d5743d4 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -412,7 +412,7 @@ int bch2_move_extent(struct moving_context *ctxt, return ret; } -static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ struct btree_iter *extent_iter, diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index fb38383ffc7b..86b80499ac55 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -122,6 +122,10 @@ int bch2_move_extent(struct moving_context *, struct bch_io_opts, struct data_update_opts); +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bpos, + struct btree_iter *, struct bkey_s_c); + int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, move_pred_fn, void *, enum btree_id, unsigned); int __bch2_move_data(struct moving_context *, diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 8fefe2b174c2..c223bb092d33 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -459,22 +459,11 @@ static int do_rebalance_extent(struct moving_context *ctxt, return ret; } -static bool rebalance_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= 
BCH_WRITE_only_specified_devs; - return data_opts->rewrite_ptrs != 0; -} - static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) { struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &trans->c->rebalance; - int ret; bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ctxt->stats = &r->scan_stats; @@ -489,11 +478,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; - ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + r->scan_start.pos, r->scan_end.pos, + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents| + BTREE_ITER_prefetch, k, ({ + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + + struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, + &snapshot_io_opts, iter.pos, &iter, k); + PTR_ERR_OR_ZERO(io_opts); + })) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_move_stats_exit(&r->scan_stats, trans->c); + + /* + * Ensure that the rebalance_work entries we created are seen by the + * next iteration of do_rebalance(), so we don't end up stuck in + * rebalance_wait(): + */ + atomic64_inc(&r->scan_stats.sectors_seen); + bch2_btree_write_buffer_flush_sync(trans); + return ret; } From 8a6fa52e07bc8d2e5535e1e2c64b621d34fef9c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 17:21:00 -0400 Subject: [PATCH 165/218] bcachefs: relock_fail tracepoint now includes btree Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 5 ++++- 1 file changed, 4 insertions(+), 
1 deletion(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index a45cfae8f671..6663e186a960 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_locking.h" #include "btree_types.h" @@ -742,7 +743,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str struct printbuf buf = PRINTBUF; bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + prt_printf(&buf, " %s l=%u seq=%u node seq=", + bch2_btree_id_str(path->btree_id), + f->l, path->l[f->l].lock_seq); if (IS_ERR_OR_NULL(f->b)) { prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); } else { From a78a11900ecbb358710f65f78eee45e3b5691180 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 17:18:27 -0400 Subject: [PATCH 166/218] bcachefs: journal path now uses discard_opt_enabled() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 15 +-------------- fs/bcachefs/bcachefs.h | 18 ++++++++++++++++++ fs/bcachefs/journal_reclaim.c | 3 ++- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index daf23d471d4f..5934104af7e6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1798,19 +1798,6 @@ struct discard_buckets_state { u64 discarded; }; -/* - * This is needed because discard is both a filesystem option and a device - * option, and mount options are supposed to apply to that mount and not be - * persisted, i.e. if it's set as a mount option we can't propagate it to the - * device. - */ -static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -{ - return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) - ? 
c->opts.discard - : ca->mi.discard; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1874,7 +1861,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; *discard_pos_done = iter.pos; - if (discard_opt_enabled(c, ca) && !c->opts.nochanges) { + if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 27c025c05f8e..252fc1eaa0dc 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -452,6 +452,7 @@ BCH_DEBUG_PARAMS_ALL() x(btree_node_compact) \ x(btree_node_merge) \ x(btree_node_sort) \ + x(btree_node_get) \ x(btree_node_read) \ x(btree_node_read_done) \ x(btree_node_write) \ @@ -459,6 +460,10 @@ BCH_DEBUG_PARAMS_ALL() x(btree_interior_update_total) \ x(btree_gc) \ x(data_write) \ + x(data_write_to_submit) \ + x(data_write_to_queue) \ + x(data_write_to_btree_update) \ + x(data_write_btree_update) \ x(data_read) \ x(data_promote) \ x(journal_flush_write) \ @@ -1272,4 +1277,17 @@ static inline unsigned data_replicas_required(struct bch_fs *c) #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } +/* + * This is needed because discard is both a filesystem option and a device + * option, and mount options are supposed to apply to that mount and not be + * persisted, i.e. if it's set as a mount option we can't propagate it to the + * device. + */ +static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +{ + return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) + ? 
c->opts.discard + : ca->mi.discard; +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ce9e0bd7ec4f..70f36f6bc482 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -300,7 +300,7 @@ void bch2_journal_do_discards(struct journal *j) while (should_discard_bucket(j, ja)) { if (!c->opts.nochanges && - ca->mi.discard && + bch2_discard_opt_enabled(c, ca) && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, @@ -701,6 +701,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (ret) break; + /* XXX shove journal discards off to another thread */ bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); From 9469556a5fc1457d0a55f391010dfb82f7c5e20a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 May 2025 07:45:52 -0400 Subject: [PATCH 167/218] bcachefs: btree key cache asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 669825f89cdd..b8efe2fddbc4 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -101,8 +101,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu kmem_cache_free(bch2_key_cache, ck); } -static void bkey_cached_free(struct btree_key_cache *bc, - struct bkey_cached *ck) +static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, + struct bkey_cached *ck) { kfree(ck->k); ck->k = NULL; @@ -116,6 +116,19 @@ static void bkey_cached_free(struct btree_key_cache *bc, this_cpu_inc(*bc->nr_pending); } +static void bkey_cached_free(struct btree_trans *trans, + struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + /* + * we'll hit strange issues in the SRCU code if we aren't holding an + * SRCU read lock... 
+ */ + EBUG_ON(!trans->srcu_held); + + bkey_cached_free_noassert(bc, ck); +} + static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; @@ -281,7 +294,7 @@ static int btree_key_cache_create(struct btree_trans *trans, ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; @@ -511,7 +524,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); if (bkey_cached_evict(&c->btree_key_cache, ck)) { - bkey_cached_free(&c->btree_key_cache, ck); + bkey_cached_free(trans, &c->btree_key_cache, ck); } else { six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); @@ -625,7 +638,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, } bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); @@ -693,7 +706,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, } else if (!bkey_cached_lock_for_evict(ck)) { bc->skipped_lock_fail++; } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free(bc, ck); + bkey_cached_free_noassert(bc, ck); bc->freed++; freed++; } else { From 295dbf50e5f60cbc41416aff1feadd876d17e492 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 17:29:53 -0400 Subject: [PATCH 168/218] bcachefs: Optimize bch2_trans_start_alloc_update() Avoid doing more updates if we already have one. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5934104af7e6..f8d21c12c3d1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -479,11 +479,26 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); + if (unlikely(ret)) return ERR_PTR(ret); + if ((void *) k.v >= trans->mem && + (void *) k.v < trans->mem + trans->mem_top) { + bch2_trans_iter_exit(trans, &iter); + return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); + } + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); + if (IS_ERR(a)) { + bch2_trans_iter_exit(trans, &iter); + return a; + } + ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? 
ERR_PTR(ret) : a; From 878713b5f56aa4b3dce5bdb7f34952a10183b106 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 23:12:09 -0400 Subject: [PATCH 169/218] bcachefs: kill copy in bch2_disk_accounting_mod() Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 195dc3fcec1d..2786a684b0c8 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -94,19 +94,27 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + if (likely(!gc)) { + unsigned u64s = sizeof(struct bkey_i_accounting) / sizeof(u64) + nr; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; - accounting_key_init(&k_i.k, k, d, nr); + journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, BTREE_ID_accounting, 0, u64s); + accounting_key_init(e->start, k, d, nr); + return 0; + } else { + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + + accounting_key_init(&k_i.k, k, d, nr); - if (unlikely(gc)) { int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); if (ret == -BCH_ERR_btree_insert_need_mark_replicas) ret = drop_locks_do(trans, bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); return ret; - } else { - return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); } } From 68708efcac711946ffeb1803eb54ebaf44675010 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 18:21:49 -0400 Subject: [PATCH 170/218] bcachefs: struct bch_fs_recovery bch_fs has gotten obnoxiously big, let's start organizing things a bit better.
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/alloc_foreground.c | 6 +-- fs/bcachefs/backpointers.c | 4 +- fs/bcachefs/bcachefs.h | 17 +------- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_io.c | 6 +-- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/fsck.c | 8 ++-- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/rebalance.c | 2 +- fs/bcachefs/recovery.c | 10 ++--- fs/bcachefs/recovery_passes.c | 60 +++++++++++++++-------------- fs/bcachefs/recovery_passes_types.h | 23 +++++++++++ fs/bcachefs/snapshot.c | 4 +- fs/bcachefs/super.c | 2 +- 15 files changed, 81 insertions(+), 69 deletions(-) create mode 100644 fs/bcachefs/recovery_passes_types.h diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f8d21c12c3d1..4ae2aa6ea758 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -309,7 +309,7 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + c->recovery.curr_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6aefa490ec24..76641cc4c27d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -154,7 +154,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) return false; return bch2_is_superblock_bucket(ca, b); @@ -524,7 +524,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, if (!avail) { if (req->watermark > BCH_WATERMARK_normal && - c->curr_recovery_pass <= 
BCH_RECOVERY_PASS_check_allocations) + c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) goto alloc; if (cl && !waiting) { @@ -554,7 +554,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, goto alloc; } - if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + if (!ob && freespace && c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index bdf524b465fa..44da8e2657af 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -120,7 +120,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + } else if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); @@ -136,7 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); } - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && __bch2_inconsistent_error(c, &buf)) ret = -BCH_ERR_erofs_unfixed_errors; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 252fc1eaa0dc..1458f131af16 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -502,6 +502,7 @@ enum bch_time_stats { #include "keylist_types.h" #include "quota_types.h" #include "rebalance_types.h" +#include "recovery_passes_types.h" #include "replicas_types.h" #include "sb-members_types.h" #include "subvolume_types.h" @@ -1116,21 +1117,7 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; - /* - * Two different uses: - * "Has this fsck pass?" - i.e. 
should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. - */ - enum bch_recovery_pass curr_recovery_pass; - enum bch_recovery_pass next_recovery_pass; - /* bitmask of recovery passes that we actually ran */ - u64 recovery_passes_complete; - /* never rewinds version of curr_recovery_pass */ - enum bch_recovery_pass recovery_pass_done; - spinlock_t recovery_pass_lock; - struct semaphore run_recovery_passes_lock; + struct bch_fs_recovery recovery; /* DEBUG JUNK */ struct dentry *fs_debug_dir; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 2fd58b08a54d..b1932b6a514b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1003,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 97cd25cd492b..e5db374f001b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -556,7 +556,7 @@ static int __btree_err(int ret, struct printbuf *err_msg, const char *fmt, ...) 
{ - if (c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) return -BCH_ERR_fsck_fix; bool have_retry = false; @@ -1428,7 +1428,7 @@ static void btree_node_read_work(struct work_struct *work) if ((failed.nr || btree_node_need_rewrite(b)) && !btree_node_read_error(b) && - c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { + c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { prt_printf(&buf, " (rewriting node)"); bch2_btree_node_rewrite_async(c, b); } @@ -1776,7 +1776,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_btree_lost_data(c, &buf, b->c.btree_id); if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology && + c->recovery.curr_pass > BCH_RECOVERY_PASS_check_topology && bch2_fs_emergency_read_only2(c, &buf)) ratelimit = false; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2d43d51b597d..a658c97439ed 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2363,7 +2363,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_journal_replay && enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index ab936520e0ae..94a64816cb50 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3177,7 +3177,7 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fsck = true; set_bit(BCH_FS_in_fsck, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + c->recovery.curr_pass = 
BCH_RECOVERY_PASS_check_alloc_info; int ret = bch2_run_online_recovery_passes(c); clear_bit(BCH_FS_in_fsck, &c->flags); @@ -3187,7 +3187,7 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - up(&c->run_recovery_passes_lock); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); return ret; } @@ -3211,7 +3211,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) if (!bch2_ro_ref_tryget(c)) return -EROFS; - if (down_trylock(&c->run_recovery_passes_lock)) { + if (down_trylock(&c->recovery.run_lock)) { bch2_ro_ref_put(c); return -EAGAIN; } @@ -3243,7 +3243,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) bch_err_fn(c, ret); if (thr) bch2_fsck_thread_exit(&thr->thr); - up(&c->run_recovery_passes_lock); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); } return ret; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index cc843815f7eb..4bfdb1befb9a 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -362,7 +362,7 @@ static int bch2_copygc_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. */ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_move_stats_init(&move_stats, "copygc"); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c223bb092d33..de1ec9e0caa0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -616,7 +616,7 @@ static int bch2_rebalance_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. 
*/ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1895a6b13001..cd2372221a54 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -434,7 +434,7 @@ int bch2_journal_replay(struct bch_fs *c) trans = NULL; if (!c->opts.retain_recovery_info && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -1001,7 +1001,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_errors_fixed, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; ret = bch2_run_recovery_passes(c); if (ret) @@ -1047,7 +1047,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && - c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && ext->btrees_lost_data) { ext->btrees_lost_data = 0; write_sb = true; @@ -1234,7 +1234,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; + c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); @@ -1257,7 +1257,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 22cefffcf1fa..c1eca55a1dde 100644 --- 
a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -210,16 +210,18 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) + struct bch_fs_recovery *r = &c->recovery; + + if (r->curr_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; - if (c->recovery_passes_complete & BIT_ULL(pass)) + if (r->passes_complete & BIT_ULL(pass)) return 0; bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { + r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { if (print) prt_printf(out, "need recovery pass %s (%u), but already rw\n", bch2_recovery_passes[pass], pass); @@ -229,14 +231,14 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, if (print) prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)\n", bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + bch2_recovery_passes[r->curr_pass], r->curr_pass); c->opts.recovery_passes |= BIT_ULL(pass); if (test_bit(BCH_FS_in_recovery, &c->flags) && - c->curr_recovery_pass > pass) { - c->next_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; + r->curr_pass > pass) { + r->next_pass = pass; + r->passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { return 0; @@ -251,9 +253,9 @@ static int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c, out->atomic++; unsigned long flags; - spin_lock_irqsave(&c->recovery_pass_lock, flags); + spin_lock_irqsave(&c->recovery.lock, flags); int ret = __bch2_run_explicit_recovery_pass(out, c, pass); - spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + spin_unlock_irqrestore(&c->recovery.lock, flags); --out->atomic; return ret; @@ -361,7 +363,7 @@ int 
bch2_run_online_recovery_passes(struct bch_fs *c) int ret = bch2_run_recovery_pass(c, i); if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; + i = c->recovery.curr_pass; continue; } if (ret) @@ -381,26 +383,26 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - down(&c->run_recovery_passes_lock); - spin_lock_irq(&c->recovery_pass_lock); + down(&c->recovery.run_lock); + spin_lock_irq(&c->recovery.lock); - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { - unsigned prev_done = c->recovery_pass_done; - unsigned pass = c->curr_recovery_pass; + while (c->recovery.curr_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + unsigned prev_done = c->recovery.pass_done; + unsigned pass = c->recovery.curr_pass; - c->next_recovery_pass = pass + 1; + c->recovery.next_pass = pass + 1; if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) + c->recovery.curr_pass > c->opts.recovery_pass_last) break; if (should_run_recovery_pass(c, pass)) { - spin_unlock_irq(&c->recovery_pass_lock); + spin_unlock_irq(&c->recovery.lock); ret = bch2_run_recovery_pass(c, pass) ?: bch2_journal_flush(&c->journal); - spin_lock_irq(&c->recovery_pass_lock); + spin_lock_irq(&c->recovery.lock); - if (c->next_recovery_pass < c->curr_recovery_pass) { + if (c->recovery.next_pass < c->recovery.curr_pass) { /* * bch2_run_explicit_recovery_pass() was called: we * can't always catch -BCH_ERR_restart_recovery because @@ -408,30 +410,30 @@ int bch2_run_recovery_passes(struct bch_fs *c) * node read completion) */ ret = 0; - c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); + c->recovery.passes_complete &= ~(~0ULL << c->recovery.curr_pass); } else { - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); + c->recovery.passes_complete |= BIT_ULL(pass); + c->recovery.pass_done = 
max(c->recovery.pass_done, pass); } } - c->curr_recovery_pass = c->next_recovery_pass; + c->recovery.curr_pass = c->recovery.next_pass; if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) { + c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots) { bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); } } - spin_unlock_irq(&c->recovery_pass_lock); - up(&c->run_recovery_passes_lock); + spin_unlock_irq(&c->recovery.lock); + up(&c->recovery.run_lock); return ret; } void bch2_fs_recovery_passes_init(struct bch_fs *c) { - spin_lock_init(&c->recovery_pass_lock); - sema_init(&c->run_recovery_passes_lock, 1); + spin_lock_init(&c->recovery.lock); + sema_init(&c->recovery.run_lock, 1); } diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h new file mode 100644 index 000000000000..69e8e29d58d0 --- /dev/null +++ b/fs/bcachefs/recovery_passes_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H +#define _BCACHEFS_RECOVERY_PASSES_TYPES_H + +struct bch_fs_recovery { + /* + * Two different uses: + * "Has this fsck pass?" - i.e. should this type of error be an + * emergency read-only + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now. 
+ */ + enum bch_recovery_pass curr_pass; + enum bch_recovery_pass next_pass; + /* never rewinds version of curr_pass */ + enum bch_recovery_pass pass_done; + /* bitmask of recovery passes that we actually ran */ + u64 passes_complete; + spinlock_t lock; + struct semaphore run_lock; +}; + +#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c3dc450cbcec..c401d5285701 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -143,7 +143,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) rcu_read_lock(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { + if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) { ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); goto out; } @@ -348,7 +348,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + if (c->recovery.curr_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } } else { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c46b2b2ebab1..170b0f26c018 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -392,7 +392,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); From ab355520305ce4ab7331757c35b1042b32cae52e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 
2025 17:45:45 -0400 Subject: [PATCH 171/218] bcachefs: __bch2_run_recovery_passes() Consolidate bch2_run_recovery_passes() and bch2_run_online_recovery_passes(), prep work for automatically scheduling and running recovery passes in the background. - Now takes a mask of which passes to run, automatic background repair will pass in sb.recovery_passes_required. - Skips passes that are failing: a pass that failed may be reattempted after another pass succeeds (some passes depend on repair done by other passes for successful completion). - bch2_recovery_passes_match() helper to skip alloc passes on a filesystem without alloc info. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/recovery.c | 7 +- fs/bcachefs/recovery_passes.c | 188 +++++++++++++++------------- fs/bcachefs/recovery_passes.h | 4 +- fs/bcachefs/recovery_passes_types.h | 2 + 5 files changed, 108 insertions(+), 95 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 94a64816cb50..0e223d4ae2ec 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3178,7 +3178,7 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) set_bit(BCH_FS_in_fsck, &c->flags); c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); + int ret = bch2_run_online_recovery_passes(c, ~0ULL); clear_bit(BCH_FS_in_fsck, &c->flags); bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index cd2372221a54..a7e6b5a6505a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -966,7 +966,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, 0); if (ret) goto err; @@ -1001,9 +1001,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_errors_fixed, &c->flags); - c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; - - ret 
= bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, + BCH_RECOVERY_PASS_check_alloc_info); if (ret) goto err; diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index c1eca55a1dde..e0e261aa752e 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -203,6 +203,21 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; +static u64 bch2_recovery_passes_match(unsigned flags) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & flags) + ret |= BIT_ULL(i); + return ret; +} + +u64 bch2_fsck_recovery_passes(void) +{ + return bch2_recovery_passes_match(PASS_FSCK); +} + /* * For when we need to rewind recovery passes and run a pass we skipped: */ @@ -235,10 +250,12 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out, c->opts.recovery_passes |= BIT_ULL(pass); + if (test_bit(BCH_FS_in_recovery, &c->flags)) + r->passes_to_run |= BIT_ULL(pass); + if (test_bit(BCH_FS_in_recovery, &c->flags) && r->curr_pass > pass) { r->next_pass = pass; - r->passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { return 0; @@ -302,37 +319,9 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, return ret; } -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); - return ret; -} - -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if ((p->when & PASS_ALLOC) && (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) - return false; - if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) - return false; - if (c->opts.recovery_passes & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - 
return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct bch_fs_recovery *r = &c->recovery; struct recovery_pass_fn *p = recovery_pass_fns + pass; if (!(p->when & PASS_SILENT)) @@ -341,8 +330,15 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) s64 start_time = ktime_get_real_seconds(); int ret = p->fn(c); - if (ret) + + r->passes_to_run &= ~BIT_ULL(pass); + + if (ret) { + r->passes_failing |= BIT_ULL(pass); return ret; + } + + r->passes_failing = 0; if (!test_bit(BCH_FS_error, &c->flags)) bch2_sb_recovery_pass_complete(c, pass, start_time); @@ -353,80 +349,96 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) return 0; } -int bch2_run_online_recovery_passes(struct bch_fs *c) +static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, + bool online) { - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; + struct bch_fs_recovery *r = &c->recovery; + int ret = 0; - if (!(p->when & PASS_ONLINE)) - continue; + spin_lock_irq(&r->lock); - int ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->recovery.curr_pass; - continue; + if (online) + orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); + + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) + orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); + + /* + * A failed recovery pass will be retried after another pass succeeds - + * but not this iteration. + * + * This is because some passes depend on repair done by other passes: we + * may want to retry, but we don't want to loop on failing passes. 
+ */ + + orig_passes_to_run &= ~r->passes_failing; + + r->passes_to_run = orig_passes_to_run; + + while (r->passes_to_run) { + unsigned prev_done = r->pass_done; + unsigned pass = __ffs64(r->passes_to_run); + r->curr_pass = pass; + r->next_pass = r->curr_pass + 1; + r->passes_to_run &= ~BIT_ULL(pass); + + spin_unlock_irq(&r->lock); + + int ret2 = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); + + spin_lock_irq(&r->lock); + + if (r->next_pass < r->curr_pass) { + /* Rewind: */ + r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); + } else if (!ret2) { + r->pass_done = max(r->pass_done, pass); + r->passes_complete |= BIT_ULL(pass); + } else { + ret = ret2; + } + + if (ret && !online) + break; + + if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && + r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { + bch2_copygc_wakeup(c); + bch2_rebalance_wakeup(c); } - if (ret) - return ret; } - return 0; + spin_unlock_irq(&r->lock); + + return ret; } -int bch2_run_recovery_passes(struct bch_fs *c) +int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) { - int ret = 0; + return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); +} + +int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) +{ + u64 passes = + bch2_recovery_passes_match(PASS_ALWAYS) | + (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | + (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | + c->opts.recovery_passes | + c->sb.recovery_passes_required; /* * We can't allow set_may_go_rw to be excluded; that would cause us to * use the journal replay keys for updates where it's not expected. 
*/ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + passes &= ~c->opts.recovery_passes_exclude; + + passes &= ~(BIT_ULL(from) - 1); down(&c->recovery.run_lock); - spin_lock_irq(&c->recovery.lock); - - while (c->recovery.curr_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { - unsigned prev_done = c->recovery.pass_done; - unsigned pass = c->recovery.curr_pass; - - c->recovery.next_pass = pass + 1; - - if (c->opts.recovery_pass_last && - c->recovery.curr_pass > c->opts.recovery_pass_last) - break; - - if (should_run_recovery_pass(c, pass)) { - spin_unlock_irq(&c->recovery.lock); - ret = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); - spin_lock_irq(&c->recovery.lock); - - if (c->recovery.next_pass < c->recovery.curr_pass) { - /* - * bch2_run_explicit_recovery_pass() was called: we - * can't always catch -BCH_ERR_restart_recovery because - * it may have been called from another thread (btree - * node read completion) - */ - ret = 0; - c->recovery.passes_complete &= ~(~0ULL << c->recovery.curr_pass); - } else { - c->recovery.passes_complete |= BIT_ULL(pass); - c->recovery.pass_done = max(c->recovery.pass_done, pass); - } - } - - c->recovery.curr_pass = c->recovery.next_pass; - - if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots) { - bch2_copygc_wakeup(c); - bch2_rebalance_wakeup(c); - } - } - - spin_unlock_irq(&c->recovery.lock); + int ret = __bch2_run_recovery_passes(c, passes, false); up(&c->recovery.run_lock); return ret; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 4c03472be5b9..0e79cc33fd8f 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -17,8 +17,8 @@ int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbu int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, enum bch_recovery_pass); -int bch2_run_online_recovery_passes(struct bch_fs 
*); -int bch2_run_recovery_passes(struct bch_fs *); +int bch2_run_online_recovery_passes(struct bch_fs *, u64); +int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); void bch2_fs_recovery_passes_init(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 69e8e29d58d0..deb6e0565cb9 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -14,8 +14,10 @@ struct bch_fs_recovery { enum bch_recovery_pass next_pass; /* never rewinds version of curr_pass */ enum bch_recovery_pass pass_done; + u64 passes_to_run; /* bitmask of recovery passes that we actually ran */ u64 passes_complete; + u64 passes_failing; spinlock_t lock; struct semaphore run_lock; }; From 7ed4c14e20be2b113e111ec9fa1803c778e00280 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 May 2025 17:36:55 -0400 Subject: [PATCH 172/218] bcachefs: Reduce usage of recovery.curr_pass We want recovery.curr_pass to be private to the recovery passes code, for better showing recovery pass status; also, it may rewind and is generally not the correct member to use. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- fs/bcachefs/alloc_foreground.c | 6 +++--- fs/bcachefs/backpointers.c | 7 ++++--- fs/bcachefs/btree_cache.c | 2 +- fs/bcachefs/btree_io.c | 3 +-- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/fsck.c | 1 - fs/bcachefs/snapshot.c | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4ae2aa6ea758..88e710ba2685 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -309,7 +309,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && - c->recovery.curr_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + !(c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 76641cc4c27d..1a52c12c51ae 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -154,7 +154,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) return false; return bch2_is_superblock_bucket(ca, b); @@ -524,7 +524,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, if (!avail) { if (req->watermark > BCH_WATERMARK_normal && - c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) + c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) goto alloc; if (cl && !waiting) { @@ -554,7 +554,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, goto alloc; } - if (!ob && freespace && c->recovery.curr_pass 
<= BCH_RECOVERY_PASS_check_alloc_info) { + if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 44da8e2657af..d9ddfc4b5dcc 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -104,6 +104,8 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + bool will_check = c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); int ret = 0; if (insert) { @@ -120,7 +122,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - } else if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + } else if (!will_check) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); @@ -136,8 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); } - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && - __bch2_inconsistent_error(c, &buf)) + if (!will_check && __bch2_inconsistent_error(c, &buf)) ret = -BCH_ERR_erofs_unfixed_errors; bch_err(c, "%s", buf.buf); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index b1932b6a514b..8557cbd3d818 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1003,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (c->recovery.curr_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e5db374f001b..34018296053a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1775,8 +1775,7 @@ void 
bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_newline(&buf); bch2_btree_lost_data(c, &buf, b->c.btree_id); - if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->recovery.curr_pass > BCH_RECOVERY_PASS_check_topology && + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && bch2_fs_emergency_read_only2(c, &buf)) ratelimit = false; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a658c97439ed..74e65714fecd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2363,7 +2363,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_journal_replay && + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0e223d4ae2ec..2a7f418f3d87 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3177,7 +3177,6 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fsck = true; set_bit(BCH_FS_in_fsck, &c->flags); - c->recovery.curr_pass = BCH_RECOVERY_PASS_check_alloc_info; int ret = bch2_run_online_recovery_passes(c, ~0ULL); clear_bit(BCH_FS_in_fsck, &c->flags); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c401d5285701..24903e7de296 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -348,7 +348,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->recovery.curr_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) 
bch2_delete_dead_snapshots_async(c); } } else { From 06266465cc8a23ae037eb48ede9bdcd5eed8621c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 May 2025 18:23:41 -0400 Subject: [PATCH 173/218] bcachefs: bch2_recovery_pass_status_to_text() Show recovery pass status in sysfs - important now that we're running them automatically in the background. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 24 ++++++++++++++++++++++++ fs/bcachefs/recovery_passes.h | 2 ++ fs/bcachefs/sysfs.c | 6 ++++++ 3 files changed, 32 insertions(+) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index e0e261aa752e..02639b3d86b0 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -444,6 +444,30 @@ int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) return ret; } +static void prt_passes(struct printbuf *out, const char *msg, u64 passes) +{ + prt_printf(out, "%s:\t", msg); + prt_bitflags(out, bch2_recovery_passes, passes); + prt_newline(out); +} + +void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_recovery *r = &c->recovery; + + printbuf_tabstop_push(out, 32); + prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); + prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & + bch2_recovery_passes_match(PASS_ONLINE)); + prt_passes(out, "Complete passes", r->passes_complete); + prt_passes(out, "Failing passes", r->passes_failing); + + if (r->curr_pass) { + prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); + prt_passes(out, "Current passes", r->passes_to_run); + } +} + void bch2_fs_recovery_passes_init(struct bch_fs *c) { spin_lock_init(&c->recovery.lock); diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 0e79cc33fd8f..8c90e29cd6cb 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -20,6 +20,8 @@ int 
bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf int bch2_run_online_recovery_passes(struct bch_fs *, u64); int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); +void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_recovery_passes_init(struct bch_fs *); #endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index de7cda282a8c..1a55196d69f1 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -35,6 +35,7 @@ #include "nocow_locking.h" #include "opts.h" #include "rebalance.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "tests.h" @@ -200,6 +201,7 @@ read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); read_attribute(snapshot_delete_status); +read_attribute(recovery_status); read_attribute(new_stripes); @@ -325,6 +327,9 @@ SHOW(bch2_fs) if (attr == &sysfs_snapshot_delete_status) bch2_snapshot_delete_status_to_text(out, c); + if (attr == &sysfs_recovery_status) + bch2_recovery_pass_status_to_text(out, c); + /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -475,6 +480,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_rebalance_status, &sysfs_snapshot_delete_status, + &sysfs_recovery_status, &sysfs_compression_stats, From d4b30ed90c778bd5612fc82c2a5536de66d95184 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 14 May 2025 15:54:20 -0400 Subject: [PATCH 174/218] bcachefs: bch2_run_explicit_recovery_pass() cleanup Consolidate the run_explicit_recovery_pass() interfaces by adding a flags parameter; this will also let us add a RUN_RECOVERY_PASS_ratelimit flag. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 2 +- fs/bcachefs/buckets.c | 8 +- fs/bcachefs/errcode.h | 1 - fs/bcachefs/error.c | 2 +- fs/bcachefs/recovery.c | 31 ++++--- fs/bcachefs/recovery_passes.c | 154 +++++++++++++++++++--------------- fs/bcachefs/recovery_passes.h | 16 ++-- fs/bcachefs/sb-members.c | 4 +- fs/bcachefs/subvolume.c | 6 +- 9 files changed, 122 insertions(+), 102 deletions(-) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 7bd13438d5ef..5a97a6b8a757 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -541,7 +541,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, struct find_btree_nodes *f = &c->found_btree_nodes; - int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); if (ret) return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8d6955ef631b..ca6e58d6fbc8 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -399,8 +399,8 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf bool print = __bch2_count_fsck_err(c, id, buf); - int ret = bch2_run_explicit_recovery_pass_persistent(c, buf, - BCH_RECOVERY_PASS_check_allocations); + int ret = bch2_run_explicit_recovery_pass(c, buf, + BCH_RECOVERY_PASS_check_allocations, 0); if (insert) { bch2_trans_updates_to_text(buf, trans); @@ -972,8 +972,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - bch2_run_explicit_recovery_pass_persistent(c, &buf, - BCH_RECOVERY_PASS_check_allocations); + bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); if (print) bch2_print_str(c, KERN_ERR, buf.buf); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 4aac0182cbed..62843e772b2c 100644 
--- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -183,7 +183,6 @@ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(EINVAL, restart_recovery) \ - x(EINVAL, not_in_recovery) \ x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ x(BCH_ERR_data_update_done, data_update_done_would_block) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 52f1108d5829..a476dd2c196e 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -102,7 +102,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) __bch2_inconsistent_error(c, out); return -BCH_ERR_btree_need_topology_repair; } else { - return bch2_run_explicit_recovery_pass_persistent(c, out, BCH_RECOVERY_PASS_check_topology) ?: + return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: -BCH_ERR_btree_node_read_validate_error; } } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a7e6b5a6505a..0f954567ea45 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -52,24 +52,24 @@ int bch2_btree_lost_data(struct bch_fs *c, } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_topology) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; /* Btree node accounting will be off: */ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_lrus) ?: ret; - ret = 
__bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; #endif switch (btree) { case BTREE_ID_alloc: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); @@ -79,30 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; goto out; case BTREE_ID_need_discard: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_freespace: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, 
msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_lru: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_accounting: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; goto out; case BTREE_ID_snapshots: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; default: - ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; } out: @@ -978,7 +978,6 @@ int bch2_fs_recovery(struct bch_fs *c) */ set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_in_fsck, &c->flags); - clear_bit(BCH_FS_in_recovery, &c->flags); /* in case we don't run journal replay, i.e. 
norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 02639b3d86b0..b931a9b465d4 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -218,107 +218,122 @@ u64 bch2_fsck_recovery_passes(void) return bch2_recovery_passes_match(PASS_FSCK); } +static bool recovery_pass_needs_set(struct bch_fs *c, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) +{ + struct bch_fs_recovery *r = &c->recovery; + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool persistent = !in_recovery || !(flags & RUN_RECOVERY_PASS_nopersistent); + + /* + * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do + * anything if the pass has already run: these mean we need a prior pass + * to run before we continue to repair, we don't expect that pass to fix + * the damage we encountered. + * + * Otherwise, we run run_explicit_recovery_pass when we find damage, so + * it should run again even if it's already run: + */ + + return persistent + ? 
!(c->sb.recovery_passes_required & BIT_ULL(pass)) + : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)); +} + /* * For when we need to rewind recovery passes and run a pass we skipped: */ -static int __bch2_run_explicit_recovery_pass(struct printbuf *out, - struct bch_fs *c, - enum bch_recovery_pass pass) +int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) { struct bch_fs_recovery *r = &c->recovery; + int ret = 0; - if (r->curr_pass == ARRAY_SIZE(recovery_pass_fns)) - return -BCH_ERR_not_in_recovery; + lockdep_assert_held(&c->sb_lock); - if (r->passes_complete & BIT_ULL(pass)) - return 0; + bch2_printbuf_make_room(out, 1024); + out->atomic++; - bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + unsigned long lockflags; + spin_lock_irqsave(&r->lock, lockflags); - if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - if (print) - prt_printf(out, "need recovery pass %s (%u), but already rw\n", - bch2_recovery_passes[pass], pass); - return -BCH_ERR_cannot_rewind_recovery; + if (!recovery_pass_needs_set(c, pass, flags)) + goto out; + + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool rewind = in_recovery && r->curr_pass > pass; + + if ((flags & RUN_RECOVERY_PASS_nopersistent) && in_recovery) { + r->passes_to_run |= BIT_ULL(pass); + } else { + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); } - if (print) - prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)\n", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[r->curr_pass], r->curr_pass); + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && + (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { + prt_printf(out, "need recovery pass %s (%u), but already rw\n", + bch2_recovery_passes[pass], 
pass); + ret = -BCH_ERR_cannot_rewind_recovery; + goto out; + } - c->opts.recovery_passes |= BIT_ULL(pass); + prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[r->curr_pass], r->curr_pass, + rewind ? " - rewinding" : ""); if (test_bit(BCH_FS_in_recovery, &c->flags)) r->passes_to_run |= BIT_ULL(pass); - if (test_bit(BCH_FS_in_recovery, &c->flags) && - r->curr_pass > pass) { + if (rewind) { r->next_pass = pass; - return -BCH_ERR_restart_recovery; - } else { - return 0; + r->passes_complete &= (1ULL << pass) >> 1; + ret = -BCH_ERR_restart_recovery; } -} - -static int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass) -{ - bch2_printbuf_make_room(out, 1024); - out->atomic++; - - unsigned long flags; - spin_lock_irqsave(&c->recovery.lock, flags); - int ret = __bch2_run_explicit_recovery_pass(out, c, pass); - spin_unlock_irqrestore(&c->recovery.lock, flags); - +out: + spin_unlock_irqrestore(&r->lock, lockflags); --out->atomic; return ret; } int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - unsigned len = buf.pos; - - int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf, pass); - - if (len != buf.pos) - bch2_print_str(c, KERN_NOTICE, buf.buf); - printbuf_exit(&buf); - return ret; -} - -int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass) -{ - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); - - return bch2_run_explicit_recovery_pass_printbuf(c, out, pass); -} - -int bch2_run_explicit_recovery_pass_persistent(struct 
bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass) -{ - if (c->sb.recovery_passes_required & BIT_ULL(pass)) + if (!recovery_pass_needs_set(c, pass, flags)) return 0; mutex_lock(&c->sb_lock); - int ret = __bch2_run_explicit_recovery_pass_persistent(c, out, pass); + int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); + bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; } +int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + if (!recovery_pass_needs_set(c, pass, RUN_RECOVERY_PASS_nopersistent)) + return 0; + + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + mutex_lock(&c->sb_lock); + int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, + RUN_RECOVERY_PASS_nopersistent); + mutex_unlock(&c->sb_lock); + + bch2_print_str(c, KERN_NOTICE, buf.buf); + printbuf_exit(&buf); + return ret; +} + static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { struct bch_fs_recovery *r = &c->recovery; @@ -409,6 +424,7 @@ static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, } } + clear_bit(BCH_FS_in_recovery, &c->flags); spin_unlock_irq(&r->lock); return ret; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 8c90e29cd6cb..30f896479a52 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -10,12 +10,18 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); -int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +enum bch_run_recovery_pass_flags { + RUN_RECOVERY_PASS_nopersistent = BIT(0), +}; -int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass); -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass); +int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); + +int 
__bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass, + enum bch_run_recovery_pass_flags); +int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass, + enum bch_run_recovery_pass_flags); int bch2_run_online_recovery_passes(struct bch_fs *, u64); int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 75184d8e685a..3398906660a5 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -20,8 +20,8 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); - int ret = bch2_run_explicit_recovery_pass_persistent(c, &buf, - BCH_RECOVERY_PASS_check_allocations); + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); if (print) bch2_print_str(c, KERN_ERR, buf.buf); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 3c6ba1469de2..35c9f86a73c1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -23,8 +23,8 @@ static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) prt_printf(&buf, "missing subvolume %u", subvolid); bool print = bch2_count_fsck_err(c, subvol_missing, &buf); - int ret = bch2_run_explicit_recovery_pass_persistent(c, &buf, - BCH_RECOVERY_PASS_check_inodes); + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_inodes, 0); if (print) bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); @@ -62,7 +62,7 @@ static int check_subvol(struct btree_trans *trans, ret = bch2_snapshot_lookup(trans, snapid, &snapshot); if (bch2_err_matches(ret, ENOENT)) - return bch2_run_explicit_recovery_pass(c, + return bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; if (ret) return ret; From 06977ea82b5df669c833399b4b8e2f163a8bcfbc Mon Sep 17 00:00:00 2001 From: Kent 
Overstreet Date: Fri, 9 May 2025 23:28:01 -0400 Subject: [PATCH 175/218] bcachefs: Run recovery passes asynchronously When we request a recovery pass to be run online, i.e. not during recovery, if it's an online pass it'll now be run in the background, instead of waiting for the next mount. To avoid situations where recovery passes are running continuously, this also includes ratelimiting: if the RUN_RECOVERY_PASS_ratelimit flag is passed, the pass may be deferred until later - depending on the runtime and last run stats in the recovery_passes superblock section. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/recovery_passes.c | 133 ++++++++++++++++++++++------ fs/bcachefs/recovery_passes.h | 1 + fs/bcachefs/recovery_passes_types.h | 2 + 4 files changed, 113 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1458f131af16..e1680b635fe1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -760,7 +760,8 @@ struct btree_trans_buf { x(snapshot_delete_pagecache) \ x(sysfs) \ x(btree_write_buffer) \ - x(btree_node_scrub) + x(btree_node_scrub) \ + x(async_recovery_passes) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index b931a9b465d4..f74f14227137 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -138,6 +138,30 @@ static void bch2_sb_recovery_pass_complete(struct bch_fs *c, mutex_unlock(&c->sb_lock); } +static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + bool ret = false; + + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable < recovery_passes_nr_entries(r)) { + struct recovery_pass_entry *i = r->start + stable; + + /* + * Ratelimit if the last runtime 
was more than 1% of the time + * since we last ran + */ + ret = (u64) le32_to_cpu(i->last_runtime) * 100 > + ktime_get_real_seconds() - le64_to_cpu(i->last_run); + } + + return ret; +} + const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { .validate = bch2_sb_recovery_passes_validate, .to_text = bch2_sb_recovery_passes_to_text @@ -218,13 +242,33 @@ u64 bch2_fsck_recovery_passes(void) return bch2_recovery_passes_match(PASS_FSCK); } +static void bch2_run_async_recovery_passes(struct bch_fs *c) +{ + if (!down_trylock(&c->recovery.run_lock)) + return; + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) + goto unlock; + + if (queue_work(system_long_wq, &c->recovery.work)) + return; + + enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); +unlock: + up(&c->recovery.run_lock); +} + static bool recovery_pass_needs_set(struct bch_fs *c, enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) + enum bch_run_recovery_pass_flags *flags) { struct bch_fs_recovery *r = &c->recovery; bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool persistent = !in_recovery || !(flags & RUN_RECOVERY_PASS_nopersistent); + bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + + if ((*flags & RUN_RECOVERY_PASS_ratelimit) && + !bch2_recovery_pass_want_ratelimit(c, pass)) + *flags &= ~RUN_RECOVERY_PASS_ratelimit; /* * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do @@ -236,9 +280,16 @@ static bool recovery_pass_needs_set(struct bch_fs *c, * it should run again even if it's already run: */ - return persistent - ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) - : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)); + if (persistent + ? 
!(c->sb.recovery_passes_required & BIT_ULL(pass)) + : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) + return true; + + if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && + (r->passes_ratelimiting & BIT_ULL(pass))) + return true; + + return false; } /* @@ -260,15 +311,14 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, unsigned long lockflags; spin_lock_irqsave(&r->lock, lockflags); - if (!recovery_pass_needs_set(c, pass, flags)) + if (!recovery_pass_needs_set(c, pass, &flags)) goto out; bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); bool rewind = in_recovery && r->curr_pass > pass; + bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - if ((flags & RUN_RECOVERY_PASS_nopersistent) && in_recovery) { - r->passes_to_run |= BIT_ULL(pass); - } else { + if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); } @@ -281,18 +331,32 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, goto out; } - prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[r->curr_pass], r->curr_pass, - rewind ? " - rewinding" : ""); + if (ratelimit) + r->passes_ratelimiting |= BIT_ULL(pass); + else + r->passes_ratelimiting &= ~BIT_ULL(pass); + + if (in_recovery && !ratelimit) { + prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[r->curr_pass], r->curr_pass, + rewind ? 
" - rewinding" : ""); - if (test_bit(BCH_FS_in_recovery, &c->flags)) r->passes_to_run |= BIT_ULL(pass); - if (rewind) { - r->next_pass = pass; - r->passes_complete &= (1ULL << pass) >> 1; - ret = -BCH_ERR_restart_recovery; + if (rewind) { + r->next_pass = pass; + r->passes_complete &= (1ULL << pass) >> 1; + ret = -BCH_ERR_restart_recovery; + } + } else { + prt_printf(out, "scheduling recovery pass %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + ratelimit ? " - ratelimiting" : ""); + + struct recovery_pass_fn *p = recovery_pass_fns + pass; + if (p->when & PASS_ONLINE) + bch2_run_async_recovery_passes(c); } out: spin_unlock_irqrestore(&r->lock, lockflags); @@ -305,20 +369,24 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass, enum bch_run_recovery_pass_flags flags) { - if (!recovery_pass_needs_set(c, pass, flags)) - return 0; + int ret = 0; - mutex_lock(&c->sb_lock); - int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) { + if (!recovery_pass_needs_set(c, pass, &flags)) + return 0; + + ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); + bch2_write_super(c); + } return ret; } int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - if (!recovery_pass_needs_set(c, pass, RUN_RECOVERY_PASS_nopersistent)) + enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; + + if (!recovery_pass_needs_set(c, pass, &flags)) return 0; struct printbuf buf = PRINTBUF; @@ -430,6 +498,19 @@ static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, return ret; } +static void bch2_async_recovery_passes_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); + struct bch_fs_recovery *r = &c->recovery; + + __bch2_run_recovery_passes(c, + c->sb.recovery_passes_required & ~r->passes_ratelimiting, + true); + + 
up(&r->run_lock); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); +} + int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) { return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); @@ -488,4 +569,6 @@ void bch2_fs_recovery_passes_init(struct bch_fs *c) { spin_lock_init(&c->recovery.lock); sema_init(&c->recovery.run_lock, 1); + + INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); } diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 30f896479a52..dc0d2014ff9b 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -12,6 +12,7 @@ u64 bch2_fsck_recovery_passes(void); enum bch_run_recovery_pass_flags { RUN_RECOVERY_PASS_nopersistent = BIT(0), + RUN_RECOVERY_PASS_ratelimit = BIT(1), }; int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index deb6e0565cb9..aa9526938cc3 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -18,8 +18,10 @@ struct bch_fs_recovery { /* bitmask of recovery passes that we actually ran */ u64 passes_complete; u64 passes_failing; + u64 passes_ratelimiting; spinlock_t lock; struct semaphore run_lock; + struct work_struct work; }; #endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ From 15f969326ee296f7b7faf7704105a99fa02c288d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 May 2025 15:05:26 -0400 Subject: [PATCH 176/218] bcachefs: Improve bucket_bitmap code Add some more helpers, and mismatches is now a superset of the empty bitmap - simplifies most checks. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 123 ++++++++++++++++++++++--------------- fs/bcachefs/backpointers.h | 7 +++ fs/bcachefs/bcachefs.h | 3 +- fs/bcachefs/buckets.c | 25 ++------ fs/bcachefs/movinggc.c | 6 +- fs/bcachefs/super.c | 8 +-- 6 files changed, 92 insertions(+), 80 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index d9ddfc4b5dcc..6b98ce1ed6c9 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -15,6 +15,8 @@ #include +static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); + static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) { return (struct bbpos) { @@ -685,31 +687,28 @@ static int check_extent_to_backpointers(struct btree_trans *trans, continue; } - u64 b = PTR_BUCKET_NR(ca, &p.ptr); - bool set[2]; - - for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) { - unsigned long *bitmap = - READ_ONCE(ca->bucket_backpointer_mismatches[i].buckets); - set[i] = bitmap && test_bit(b, bitmap); + if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) { + rcu_read_unlock(); + continue; } - bool check = set[0]; - bool empty = set[1]; + u64 b = PTR_BUCKET_NR(ca, &p.ptr); + if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) { + rcu_read_unlock(); + continue; + } - bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); + bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); rcu_read_unlock(); - if ((check || empty) && !stale) { - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - int ret = check - ? check_bp_exists(trans, s, &bp, k) - : bch2_bucket_backpointer_mod(trans, k, &bp, true); - if (ret) - return ret; - } + int ret = !empty + ? 
check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; } return 0; @@ -952,21 +951,12 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b sectors[ALLOC_stripe] + sectors[ALLOC_cached]) == 0; - struct bucket_bitmap *bitmap = &ca->bucket_backpointer_mismatches[empty]; - - mutex_lock(&bitmap->lock); - if (!bitmap->buckets) { - bitmap->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), GFP_KERNEL); - if (!bitmap->buckets) { - mutex_unlock(&bitmap->lock); - ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; - goto err; - } - } - - bitmap->nr += !__test_and_set_bit(alloc_k.k->p.offset, bitmap->buckets); - mutex_unlock(&bitmap->lock); + ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, + alloc_k.k->p.offset) ?: + (empty + ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, + alloc_k.k->p.offset) + : 0); } err: bch2_dev_put(ca); @@ -992,15 +982,10 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) struct bpos bucket = bp_pos_to_bucket(ca, pos); u64 next = ca->mi.nbuckets; - for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) { - unsigned long *bitmap = - READ_ONCE(ca->bucket_backpointer_mismatches[i].buckets); - if (bitmap) - next = min_t(u64, next, - find_next_bit(bitmap, - ca->mi.nbuckets, - bucket.offset)); - } + unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); + if (bitmap) + next = min_t(u64, next, + find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); bucket.offset = next; if (bucket.offset == ca->mi.nbuckets) @@ -1124,18 +1109,17 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) if (ret) goto err; - u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; + u64 nr_buckets = 0, nr_mismatches = 0; for_each_member_device(c, ca) { nr_buckets += ca->mi.nbuckets; - nr_mismatches += ca->bucket_backpointer_mismatches[0].nr; - nr_empty += 
ca->bucket_backpointer_mismatches[1].nr; + nr_mismatches += ca->bucket_backpointer_mismatch.nr; } - if (!nr_mismatches && !nr_empty) + if (!nr_mismatches) goto err; bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches + nr_empty, nr_buckets); + nr_mismatches, nr_buckets); while (1) { ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); @@ -1171,9 +1155,10 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_bkey_buf_exit(&s.last_flushed, c); bch2_btree_cache_unpin(c); - for_each_member_device(c, ca) - for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatches[i]); + for_each_member_device(c, ca) { + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); + } bch_err_fn(c, ret); return ret; @@ -1297,6 +1282,42 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) return ret; } +static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) { + b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), GFP_KERNEL); + if (!b->buckets) + return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + } + + b->nr += !__test_and_set_bit(bit, b->buckets); + } + + return 0; +} + +int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) + return 0; + + unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), + sizeof(unsigned long), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + + memcpy(n, b->buckets, + BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); + kvfree(b->buckets); + b->buckets = n; + } + + return 0; +} + void bch2_bucket_bitmap_free(struct bucket_bitmap *b) { mutex_lock(&b->lock); diff --git a/fs/bcachefs/backpointers.h 
b/fs/bcachefs/backpointers.h index f57098c32143..fe7149a2fbf5 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -188,6 +188,13 @@ int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); int bch2_check_backpointers_to_extents(struct bch_fs *); +static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) +{ + unsigned long *bitmap = READ_ONCE(b->buckets); + return bitmap && test_bit(i, bitmap); +} + +int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64); void bch2_bucket_bitmap_free(struct bucket_bitmap *); #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e1680b635fe1..b58fad743fc4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -626,7 +626,8 @@ struct bch_dev { u8 *oldest_gen; unsigned long *buckets_nouse; - struct bucket_bitmap bucket_backpointer_mismatches[2]; + struct bucket_bitmap bucket_backpointer_mismatch; + struct bucket_bitmap bucket_backpointer_empty; struct bch_dev_usage_full __percpu *usage; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ca6e58d6fbc8..8bb6384190c5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1324,27 +1324,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) sizeof(bucket_gens->b[0]) * copy); } - for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) { - struct bucket_bitmap *bitmap = &ca->bucket_backpointer_mismatches[i]; - - mutex_lock(&bitmap->lock); - if (bitmap->buckets) { - unsigned long *n = kvcalloc(BITS_TO_LONGS(nbuckets), - sizeof(unsigned long), GFP_KERNEL); - if (!n) { - mutex_unlock(&bitmap->lock); - ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; - goto err; - } - - memcpy(n, bitmap->buckets, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvfree(bitmap->buckets); - bitmap->buckets = n; - - } - mutex_unlock(&bitmap->lock); - } 
+ ret = bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch, + ca->mi.nbuckets, nbuckets) ?: + bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty, + ca->mi.nbuckets, nbuckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 4bfdb1befb9a..0a751a65386f 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -76,7 +77,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (ca->mi.state != BCH_MEMBER_STATE_rw || !bch2_dev_is_online(ca)) - goto out_put; + goto out; struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); @@ -85,9 +86,8 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); ret = lru_idx && lru_idx <= time; -out_put: - bch2_dev_put(ca); out: + bch2_dev_put(ca); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 170b0f26c018..24658bf450ab 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1366,8 +1366,8 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); - for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatches[i]); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); @@ -1499,8 +1499,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif - for (unsigned i = 0; i < ARRAY_SIZE(ca->bucket_backpointer_mismatches); i++) - 
mutex_init(&ca->bucket_backpointer_mismatches[i].lock); + mutex_init(&ca->bucket_backpointer_mismatch.lock); + mutex_init(&ca->bucket_backpointer_empty.lock); bch2_dev_allocator_background_init(ca); From 39cea302f13a0a9dc4cf39248529a42e79d06842 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 May 2025 17:01:05 -0400 Subject: [PATCH 177/218] bcachefs: bch2_check_bucket_backpointer_mismatch() Detect buckets with missing backpointers, and run repair on demand. __bch2_move_data_phys() now calls bch2_check_bucket_backpointer_mismatch() as it walks buckets, which checks for missing backpointers by comparing backpointers against bucket sector counts. When missing backpointers are detected, we kick off bch2_check_extents_to_backpointers() asynchronously - right away if we're trying to evacuate, or with a threshold if we're just running copygc. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 7 +++- fs/bcachefs/backpointers.c | 75 +++++++++++++++++++++++++++++++--- fs/bcachefs/backpointers.h | 3 +- fs/bcachefs/move.c | 21 ++++++++-- fs/bcachefs/movinggc.c | 3 ++ 5 files changed, 98 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 88e710ba2685..a38b9c6c891e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -2175,8 +2175,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, BUG_ON(a->data_type != BCH_DATA_cached); BUG_ON(a->dirty_sectors); - if (!a->cached_sectors) - bch_err(c, "invalidating empty bucket, confused"); + if (!a->cached_sectors) { + bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, + true, last_flushed); + goto out; + } unsigned cached_sectors = a->cached_sectors; u8 gen = a->gen; diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 6b98ce1ed6c9..c08bc6685078 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -12,6 +12,7 @@ #include "disk_accounting.h" #include 
"error.h" #include "progress.h" +#include "recovery_passes.h" #include @@ -804,6 +805,13 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } +static inline int bch2_fs_going_ro(struct bch_fs *c) +{ + return test_bit(BCH_FS_going_ro, &c->flags) + ? -EROFS + : 0; +} + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { @@ -831,6 +839,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, ret = for_each_btree_key_continue(trans, iter, 0, k, ({ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + bch2_fs_going_ro(c) ?: check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); @@ -870,6 +879,7 @@ static int data_type_to_alloc_counter(enum bch_data_type t) static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + bool *had_mismatch, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -877,6 +887,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); bool need_commit = false; + *had_mismatch = false; + if (a->data_type == BCH_DATA_sb || a->data_type == BCH_DATA_journal || a->data_type == BCH_DATA_parity) @@ -957,6 +969,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b ? 
bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, alloc_k.k->p.offset) : 0); + + *had_mismatch = true; } err: bch2_dev_put(ca); @@ -1104,7 +1118,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ - check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + bool had_mismatch; + bch2_fs_going_ro(c) ?: + check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); })); if (ret) goto err; @@ -1150,20 +1166,69 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) s.bp_start = bpos_successor(s.bp_end); } -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); for_each_member_device(c, ca) { bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); } +err: + bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); + bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; } +static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, + struct bpos bucket, + bool *had_mismatch, + struct bkey_buf *last_flushed) +{ + struct btree_iter alloc_iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, + struct bch_dev *ca, u64 bucket, + bool copygc, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + bool had_mismatch; + int ret = lockrestart_do(trans, + check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), + &had_mismatch, last_flushed)); + if (ret || !had_mismatch) + return ret; + + u64 nr = ca->bucket_backpointer_mismatch.nr; + u64 allowed = copygc ? 
ca->mi.nbuckets >> 7 : 0; + + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", + bucket, nr, ca->mi.nbuckets); + + bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_extents_to_backpointers, + nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return 0; +} + /* backpointers -> extents */ static int check_one_backpointer(struct btree_trans *trans, diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index fe7149a2fbf5..6840561084ce 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -182,7 +182,8 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, struct btree_iter *, struct bkey_buf *); -int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bpos, struct bkey_buf *); +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, + bool, struct bkey_buf *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 49898d5743d4..0dd3bec3acff 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -815,6 +815,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, u64 bucket_start, u64 bucket_end, unsigned data_types, + bool copygc, move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; @@ -825,6 +826,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct bkey_buf sk; struct bkey_s_c k; struct bkey_buf last_flushed; + u64 check_mismatch_done = bucket_start; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, dev); @@ -835,8 +837,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct 
bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - bch2_dev_put(ca); - ca = NULL; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -871,6 +871,14 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!k.k || bkey_gt(k.k->p, bp_end)) break; + if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); + } + continue; + } + if (k.k->type != KEY_TYPE_backpointer) goto next; @@ -946,10 +954,15 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, next: bch2_btree_iter_advance(trans, &bp_iter); } + + while (check_mismatch_done < bucket_end) + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); err: bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&last_flushed, c); + bch2_dev_put(ca); return ret; } @@ -974,7 +987,8 @@ int bch2_move_data_phys(struct bch_fs *c, ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; } - int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, + data_types, false, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; @@ -1019,6 +1033,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bucket.offset, bucket.offset + 1, ~0, + true, evacuate_bucket_pred, &arg); } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0a751a65386f..7cb0b3d347b4 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -75,6 +75,9 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (!ca) goto out; + if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) + goto out; + if (ca->mi.state != BCH_MEMBER_STATE_rw || 
!bch2_dev_is_online(ca)) goto out; From 6b86da9282b0f6a3fb7aae709dca9feb4c8316b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 May 2025 15:58:23 -0400 Subject: [PATCH 178/218] bcachefs: fsck: Include loops in error messages This fixes the subvol loop checking and directory loop checking to print the loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2a7f418f3d87..9d94d31cfec9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2502,7 +2502,14 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { - if (fsck_err(trans, subvol_loop, "subvolume loop")) + printbuf_reset(&buf); + prt_printf(&buf, "subvolume loop:\n"); + + darray_for_each_reverse(subvol_path, i) + prt_printf(&buf, "%u ", *i); + prt_printf(&buf, "%u", parent); + + if (fsck_err(trans, subvol_loop, "%s", buf.buf)) ret = reattach_subvol(trans, s); break; } @@ -2518,7 +2525,8 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, trans, subvol_unreachable, "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = reattach_subvol(trans, s); break; @@ -2674,14 +2682,13 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) redo_bi_depth = true; if (path_is_dup(&path, inode.bi_inum, snapshot)) { - /* XXX print path */ - bch_err(c, "directory structure loop"); + printbuf_reset(&buf); + prt_printf(&buf, "directory structure loop:\n"); + darray_for_each_reverse(path, i) + prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot); + prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot); - darray_for_each(path, i) - pr_err("%llu:%u", 
i->inum, i->snapshot); - pr_err("%llu:%u", inode.bi_inum, snapshot); - - if (fsck_err(trans, dir_loop, "directory structure loop")) { + if (fsck_err(trans, dir_loop, "%s", buf.buf)) { ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) From 8c3fc7cca38459751489f5015f3282a64e452b7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 May 2025 19:53:50 -0400 Subject: [PATCH 179/218] bcachefs: fix bch2_debugfs_flush_buf() when tabstops are in use Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 4ee5d486b305..4fa70634c90e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -320,6 +320,11 @@ ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) i->buf.pos -= copied; memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); + if (i->buf.last_newline >= copied) + i->buf.last_newline -= copied; + if (i->buf.last_field >= copied) + i->buf.last_field -= copied; + if (copied != bytes) return -EFAULT; } From 51e23c9d60a42f8da4d2f4d48c86eb00c4e351ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 May 2025 19:54:39 -0400 Subject: [PATCH 180/218] bcachefs: async objs now support bch_write_ops Signed-off-by: Kent Overstreet --- fs/bcachefs/async_objs.c | 6 ++++++ fs/bcachefs/async_objs_types.h | 1 + fs/bcachefs/io_write.c | 6 ++++++ fs/bcachefs/io_write_types.h | 4 ++++ fs/bcachefs/move.c | 1 - 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c index 57e2fe421461..a7cd1f0f0964 100644 --- a/fs/bcachefs/async_objs.c +++ b/fs/bcachefs/async_objs.c @@ -9,6 +9,7 @@ #include "btree_io.h" #include "debug.h" #include "io_read.h" +#include "io_write.h" #include @@ -22,6 +23,11 @@ static void rbio_obj_to_text(struct printbuf *out, void *obj) bch2_read_bio_to_text(out, obj); } +static void write_op_obj_to_text(struct printbuf *out, void *obj) +{ + 
bch2_write_op_to_text(out, obj); +} + static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) { struct btree_read_bio *rbio = obj; diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h index 310a4f90f49b..8d713c0f5841 100644 --- a/fs/bcachefs/async_objs_types.h +++ b/fs/bcachefs/async_objs_types.h @@ -5,6 +5,7 @@ #define BCH_ASYNC_OBJ_LISTS() \ x(promote) \ x(rbio) \ + x(write_op) \ x(btree_read_bio) \ x(btree_write_bio) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 399df8fede8b..fd4b89d6a96a 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_buf.h" #include "bset.h" #include "btree_update.h" @@ -547,6 +548,7 @@ static void bch2_write_done(struct closure *cl) EBUG_ON(cl->parent); closure_debug_destroy(cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } @@ -1673,6 +1675,8 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + async_object_list_add(c, write_op, op, &op->list_idx); + if (op->flags & BCH_WRITE_only_specified_devs) op->flags |= BCH_WRITE_alloc_nowait; @@ -1717,6 +1721,7 @@ CLOSURE_CALLBACK(bch2_write) bch2_disk_reservation_put(c, &op->res); closure_debug_destroy(&op->cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } @@ -1750,6 +1755,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index b4a6a44a45d0..5da4eb8bb6f6 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -71,6 +71,10 @@ struct bch_write_op { void 
(*end_io)(struct bch_write_op *); u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + unsigned written; /* sectors */ u16 flags; s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0dd3bec3acff..79f4722621d5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -109,7 +109,6 @@ static void move_write_done(struct bch_write_op *op) struct printbuf buf = PRINTBUF; bch2_write_op_to_text(&buf, op); - prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); trace_io_move_write_fail(c, buf.buf); printbuf_exit(&buf); } From 81c42933a50766f5230384375e28b5cd64a46113 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 May 2025 22:32:51 -0400 Subject: [PATCH 181/218] bcachefs: Make accounting mismatch errors more readable Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 2786a684b0c8..a26bc81a8f49 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -580,11 +580,11 @@ int bch2_gc_accounting_done(struct bch_fs *c) prt_str(&buf, "accounting mismatch for "); bch2_accounting_key_to_text(&buf, &acc_k); - prt_str(&buf, ": got"); + prt_str(&buf, ":\n got"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", dst_v[j]); - prt_str(&buf, " should be"); + prt_str(&buf, "\nshould be"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", src_v[j]); From 247abee6ae6d2c6f283857b16fbf4bf201e72061 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 20:23:58 -0400 Subject: [PATCH 182/218] bcachefs: btree_trans_subbuf Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_iter.c | 4 +-- fs/bcachefs/btree_trans_commit.c | 31 +++++++++++---------- fs/bcachefs/btree_types.h | 10 +++++-- fs/bcachefs/btree_update.c | 35 
++++++++++++------------ fs/bcachefs/btree_update.h | 47 +++++++++++++++++++++++++------- fs/bcachefs/disk_accounting.c | 4 +-- fs/bcachefs/disk_accounting.h | 4 +-- 8 files changed, 82 insertions(+), 54 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b58fad743fc4..3077f15439cd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -712,7 +712,6 @@ struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned journal_entries_size; unsigned max_mem; #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_trans_kmalloc_trace trans_kmalloc_trace; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e0c1e873c886..0f0b80c8c29a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1499,7 +1499,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - for (struct jset_entry *e = trans->journal_entries; + for (struct jset_entry *e = btree_trans_journal_entries_start(trans); e != btree_trans_journal_entries_top(trans); e = vstruct_next(e)) { bch2_journal_entry_to_text(buf, trans->c, e); @@ -3280,7 +3280,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; - trans->journal_entries = NULL; trans_for_each_path(trans, path, i) { path->should_be_locked = false; @@ -3438,7 +3437,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) } trans->nr_paths_max = s->nr_max_paths; - trans->journal_entries_size = s->journal_entries_size; } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 7e17df1df7f1..abbecddb18ee 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -663,15 +663,16 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, h = h->next; } - struct jset_entry *entry = 
trans->journal_entries; + struct jset_entry *entry; percpu_down_read(&c->mark_lock); - for (entry = trans->journal_entries; - entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (entry = btree_trans_journal_entries_start(trans); + entry != btree_trans_journal_entries_top(trans); entry = vstruct_next(entry)) if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && entry->start->k.type == KEY_TYPE_accounting) { - ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); + ret = bch2_accounting_trans_commit_hook(trans, + bkey_i_to_accounting(entry->start), flags); if (ret) goto revert_fs_usage; } @@ -698,8 +699,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); i = vstruct_next(i)) { ret = bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, @@ -754,11 +755,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->journal_entries, - trans->journal_entries_u64s); + btree_trans_journal_entries_start(trans), + trans->journal_entries.u64s); - trans->journal_res.offset += trans->journal_entries_u64s; - trans->journal_res.u64s -= trans->journal_entries_u64s; + trans->journal_res.offset += trans->journal_entries.u64s; + trans->journal_res.u64s -= trans->journal_entries.u64s; if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -780,7 +781,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", 
bch2_err_str(ret)); percpu_down_read(&c->mark_lock); revert_fs_usage: - for (struct jset_entry *entry2 = trans->journal_entries; + for (struct jset_entry *entry2 = btree_trans_journal_entries_start(trans); entry2 != entry; entry2 = vstruct_next(entry2)) if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && @@ -961,8 +962,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); i = vstruct_next(i)) if (i->type == BCH_JSET_ENTRY_btree_keys || i->type == BCH_JSET_ENTRY_write_buffer_keys) { @@ -987,7 +988,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!trans->nr_updates && - !trans->journal_entries_u64s) + !trans->journal_entries.u64s) goto out_reset; ret = bch2_trans_commit_run_triggers(trans); @@ -1005,7 +1006,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries_u64s; + trans->journal_u64s = trans->journal_entries.u64s; trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e5a965db68b4..7ebf43fc8bae 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -480,6 +480,12 @@ struct trans_kmalloc_trace { }; typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; +struct btree_trans_subbuf { + u16 base; + u16 u64s; + u16 size;; +}; + struct btree_trans { struct bch_fs *c; @@ -534,9 +540,7 @@ struct btree_trans { int srcu_idx; /* update path: */ - u16 journal_entries_u64s; - u16 journal_entries_size; - struct 
jset_entry *journal_entries; + struct btree_trans_subbuf journal_entries; struct btree_trans_commit_hook *hooks; struct journal_entry_pin *journal_pin; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index ce83cd037551..20fba8d17431 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -565,30 +565,29 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s) { - unsigned new_top = trans->journal_entries_u64s + u64s; - unsigned old_size = trans->journal_entries_size; + unsigned new_top = buf->u64s + u64s; + unsigned old_size = buf->size; - if (new_top > trans->journal_entries_size) { - trans->journal_entries_size = roundup_pow_of_two(new_top); + if (new_top > buf->size) + buf->size = roundup_pow_of_two(new_top); - btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; - } - - struct jset_entry *n = - bch2_trans_kmalloc_nomemzero(trans, - trans->journal_entries_size * sizeof(u64)); + void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64)); if (IS_ERR(n)) - return ERR_CAST(n); + return n; - if (trans->journal_entries) - memcpy(n, trans->journal_entries, old_size * sizeof(u64)); - trans->journal_entries = n; + if (buf->u64s) + memcpy(n, + btree_trans_subbuf_base(trans, buf), + old_size * sizeof(u64)); + buf->base = (u64 *) n - (u64 *) trans->mem; - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s = new_top; - return e; + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s = new_top; + return p; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 3a246610b673..8964b321804c 100644 
--- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -113,23 +113,49 @@ bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); } -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); +static inline void *btree_trans_subbuf_base(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base; +} + +static inline void *btree_trans_subbuf_top(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base + buf->u64s; +} + +void *__bch2_trans_subbuf_alloc(struct btree_trans *, + struct btree_trans_subbuf *, + unsigned); + +static inline void * +bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s) +{ + if (buf->u64s + u64s > buf->size) + return __bch2_trans_subbuf_alloc(trans, buf, u64s); + + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s += u64s; + return p; +} + +static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) +{ + return btree_trans_subbuf_base(trans, &trans->journal_entries); +} static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) { - return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + return btree_trans_subbuf_top(trans, &trans->journal_entries); } static inline struct jset_entry * bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) { - if (!trans->journal_entries || - trans->journal_entries_u64s + u64s > trans->journal_entries_size) - return __bch2_trans_jset_entry_alloc(trans, u64s); - - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s += u64s; - return e; + return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s); } int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); @@ -227,7 
+253,8 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) bch2_path_put(trans, i->path, true); trans->nr_updates = 0; - trans->journal_entries_u64s = 0; + trans->journal_entries.u64s = 0; + trans->journal_entries.size = 0; trans->hooks = NULL; trans->extra_disk_res = 0; } diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a26bc81a8f49..04e0d2ac2727 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -307,8 +307,8 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) */ int bch2_accounting_update_sb(struct btree_trans *trans) { - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); i = vstruct_next(i)) if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 54cb8a5b117d..54fa3e098c30 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -259,8 +259,8 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, struct bkey_i_accounting *a, unsigned commit_flags) { - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) a - (u64 *) trans->journal_entries); + u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->journal_entries); + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); EBUG_ON(bversion_zero(a->k.bversion)); From e8f9992b0aab188ce37ef011a3ec8613f02d05aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 20:26:01 -0400 Subject: [PATCH 183/218] bcachefs: Split out accounting in transaction commit There can be a lot of redundancy in accounting updates within a single btree transaction. 
Split out accounting updates so that they can be deduped, in the next commit. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 51 +++++++++++++++++++------------- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update.h | 2 ++ fs/bcachefs/disk_accounting.c | 23 +++++++------- fs/bcachefs/disk_accounting.h | 2 +- fs/bcachefs/recovery.c | 7 ++++- 6 files changed, 52 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index abbecddb18ee..1c03c965d836 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -663,19 +663,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, h = h->next; } - struct jset_entry *entry; + struct bkey_i *accounting; percpu_down_read(&c->mark_lock); - for (entry = btree_trans_journal_entries_start(trans); - entry != btree_trans_journal_entries_top(trans); - entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && - entry->start->k.type == KEY_TYPE_accounting) { - ret = bch2_accounting_trans_commit_hook(trans, - bkey_i_to_accounting(entry->start), flags); - if (ret) - goto revert_fs_usage; - } + for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); + accounting != btree_trans_subbuf_top(trans, &trans->accounting); + accounting = bkey_next(accounting)) { + ret = bch2_accounting_trans_commit_hook(trans, + bkey_i_to_accounting(accounting), flags); + if (ret) + goto revert_fs_usage; + } percpu_up_read(&c->mark_lock); /* XXX: we only want to run this if deltas are nonzero */ @@ -761,6 +759,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans->journal_res.offset += trans->journal_entries.u64s; trans->journal_res.u64s -= trans->journal_entries.u64s; + memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_write_buffer_keys, + BTREE_ID_accounting, 0, + trans->accounting.u64s)->_data, + 
btree_trans_subbuf_base(trans, &trans->accounting), + trans->accounting.u64s); + if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } @@ -781,13 +786,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); percpu_down_read(&c->mark_lock); revert_fs_usage: - for (struct jset_entry *entry2 = btree_trans_journal_entries_start(trans); - entry2 != entry; - entry2 = vstruct_next(entry2)) - if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && - entry2->start->k.type == KEY_TYPE_accounting) - bch2_accounting_trans_commit_revert(trans, - bkey_i_to_accounting(entry2->start), flags); + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != accounting; + i = bkey_next(i)) + bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); percpu_up_read(&c->mark_lock); return ret; } @@ -972,6 +974,14 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); + if (ret) + return ret; + } + return 0; } @@ -988,7 +998,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!trans->nr_updates && - !trans->journal_entries.u64s) + !trans->journal_entries.u64s && + !trans->accounting.u64s) goto out_reset; ret = bch2_trans_commit_run_triggers(trans); @@ -1006,7 +1017,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries.u64s; + trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if 
(trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 7ebf43fc8bae..9d641bf9d2a2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -541,6 +541,7 @@ struct btree_trans { /* update path: */ struct btree_trans_subbuf journal_entries; + struct btree_trans_subbuf accounting; struct btree_trans_commit_hook *hooks; struct journal_entry_pin *journal_pin; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 8964b321804c..a54dc7277177 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -255,6 +255,8 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->nr_updates = 0; trans->journal_entries.u64s = 0; trans->journal_entries.size = 0; + trans->accounting.u64s = 0; + trans->accounting.size = 0; trans->hooks = NULL; trans->extra_disk_res = 0; } diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 04e0d2ac2727..088b015fc198 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -96,13 +96,13 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, if (likely(!gc)) { unsigned u64s = sizeof(struct bkey_i_accounting) / sizeof(u64) + nr; - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); + struct bkey_i_accounting *a = + bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); + int ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; - journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, BTREE_ID_accounting, 0, u64s); - accounting_key_init(e->start, k, d, nr); + accounting_key_init(&a->k_i, k, d, nr); return 0; } else { struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; @@ -307,14 +307,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) */ int bch2_accounting_update_sb(struct btree_trans *trans) { - for (struct jset_entry *i = 
btree_trans_journal_entries_start(trans); - i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) - if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { - int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); - if (ret) - return ret; - } + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); + if (ret) + return ret; + } return 0; } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 54fa3e098c30..f6098e33ab30 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -259,7 +259,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, struct bkey_i_accounting *a, unsigned commit_flags) { - u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->journal_entries); + u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); EBUG_ON(bversion_zero(a->k.bversion)); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0f954567ea45..4fca57575565 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -286,7 +286,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans, goto out; if (k->k->k.type == KEY_TYPE_accounting) { - ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); + struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto out; + + bkey_copy(n, k->k); goto out; } From 7fd643c032ae0ced53d57fc23981c4d3d269d352 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 20:43:18 -0400 Subject: [PATCH 184/218] bcachefs: Coalesce accounting in trans commit Accounting has gotten quite heavy, and there's lots of redundancy in accounting updates within a 
transaction, as we often add/delete multiple extents that touch the same accounting counters. This will reduce the amount of data that we journal, and reduce pressure downstream on the btree write buffer. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 36 ++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 088b015fc198..9f20db560eab 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = { NULL }; -static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, - s64 *d, unsigned nr) +static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, + s64 *d, unsigned nr) { struct bkey_i_accounting *acc = bkey_accounting_init(k); - acc->k.p = disk_accounting_pos_to_bpos(pos); + acc->k.p = pos; set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); memcpy_u64s_small(acc->v.d, d, nr); } +static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, + s64 *d, unsigned nr) +{ + return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); +} + static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) { + BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + /* Normalize: */ switch (k->type) { case BCH_DISK_ACCOUNTING_replicas: @@ -92,22 +100,32 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, break; } - BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + struct bpos pos = disk_accounting_pos_to_bpos(k); if (likely(!gc)) { - unsigned u64s = sizeof(struct bkey_i_accounting) / sizeof(u64) + nr; - struct bkey_i_accounting *a = - bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); + struct bkey_i_accounting *a; + + for (a 
= btree_trans_subbuf_base(trans, &trans->accounting); + a != btree_trans_subbuf_top(trans, &trans->accounting); + a = (void *) bkey_next(&a->k_i)) + if (bpos_eq(a->k.p, pos)) { + BUG_ON(nr != bch2_accounting_counters(&a->k)); + acc_u64s(a->v.d, d, nr); + return 0; + } + + unsigned u64s = sizeof(*a) / sizeof(u64) + nr; + a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); int ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; - accounting_key_init(&a->k_i, k, d, nr); + __accounting_key_init(&a->k_i, pos, d, nr); return 0; } else { struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - accounting_key_init(&k_i.k, k, d, nr); + __accounting_key_init(&k_i.k, pos, d, nr); int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); if (ret == -BCH_ERR_btree_insert_need_mark_replicas) From f132a78095b6a67f717657a09640539cb847d2d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 17:07:06 -0400 Subject: [PATCH 185/218] bcachefs: Simplify bch2_extent_atomic_end() It used to be that we had a fixed maximum number of btree paths to work with - 64. That's no longer the case, so bch2_extent_atomic_end() doesn't have to be as strict. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 +-- fs/bcachefs/extent_update.c | 67 ++++++++++------------------------- fs/bcachefs/extent_update.h | 2 +- 3 files changed, 22 insertions(+), 51 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 9f20db560eab..488c342b9cd7 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -104,7 +104,7 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, if (likely(!gc)) { struct bkey_i_accounting *a; - +#if 0 for (a = btree_trans_subbuf_base(trans, &trans->accounting); a != btree_trans_subbuf_top(trans, &trans->accounting); a = (void *) bkey_next(&a->k_i)) @@ -113,7 +113,7 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, acc_u64s(a->v.d, d, nr); return 0; } - +#endif unsigned u64s = sizeof(*a) / sizeof(u64) + nr; a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); int ret = PTR_ERR_OR_ZERO(a); diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 6bb42985306e..b899ee75f5b9 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return lru + ret * 2; } +#define EXTENT_ITERS_MAX 64 + static int count_iters_for_insert(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, struct bpos *end, - unsigned *nr_iters, - unsigned max_iters) + unsigned *nr_iters) { int ret = 0, ret2 = 0; - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans, case KEY_TYPE_reflink_v: *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -81,7 +82,7 @@ static int count_iters_for_insert(struct btree_trans *trans, *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - if (*nr_iters >= 
max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { struct bpos pos = bkey_start_pos(k.k); pos.offset += min_t(u64, k.k->size, r_k.k->p.offset - idx); @@ -100,59 +101,31 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) - int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, struct bpos *end) { - struct btree_iter copy; - struct bkey_s_c k; unsigned nr_iters = 0; - int ret; - - ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - - *end = insert->k.p; - - /* extent_update_to_keys(): */ - nr_iters += 1; - - ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, - &nr_iters, EXTENT_ITERS_MAX / 2); - if (ret < 0) - return ret; + struct btree_iter copy; bch2_trans_copy_iter(trans, &copy, iter); - for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) { + int ret = bch2_btree_iter_traverse(trans, &copy); + if (ret) + goto err; + + struct bkey_s_c k; + for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) { unsigned offset = 0; - if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); + if (bkey_gt(iter->pos, bkey_start_pos(k.k))) + offset = iter->pos.offset - bkey_start_offset(k.k); - /* extent_handle_overwrites(): */ - switch (bch2_extent_overlap(&insert->k, k.k)) { - case BCH_EXTENT_OVERLAP_ALL: - case BCH_EXTENT_OVERLAP_FRONT: - nr_iters += 1; - break; - case BCH_EXTENT_OVERLAP_BACK: - case BCH_EXTENT_OVERLAP_MIDDLE: - nr_iters += 2; - break; - } - - ret = count_iters_for_insert(trans, k, offset, end, - &nr_iters, EXTENT_ITERS_MAX); + ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); if (ret) break; } - +err: bch2_trans_iter_exit(trans, &copy); return ret < 0 ? 
ret : 0; } @@ -161,10 +134,8 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k) { - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(trans, iter, k, &end); + struct bpos end = k->k.p; + int ret = bch2_extent_atomic_end(trans, iter, &end); if (ret) return ret; diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 6f5cf449361a..34467db53f45 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -5,7 +5,7 @@ #include "bcachefs.h" int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct bpos *); + struct bpos *); int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, struct bkey_i *); From c631bb41f5a9cf266762290166a00132a04f215d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 May 2025 16:45:44 -0400 Subject: [PATCH 186/218] bcachefs: Call bch2_bkey_set_needs_rebalance() earlier in write path There's no reason to be running this inside our transaction; it forces us to copy the key we're updating to a temporary, which we'd like to skip. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index fd4b89d6a96a..52a60982a66b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -403,8 +403,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: - bch2_extent_update(trans, inum, &iter, sk.k, + ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_check_enospc); @@ -475,6 +474,10 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); + const struct bch_extent_ptr *last = NULL; + bkey_for_each_ptr(ptrs, ptr) + last = ptr; + bkey_for_each_ptr(ptrs, ptr) { /* * XXX: btree writes should be using io_ref[WRITE], but we @@ -485,7 +488,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ? bch2_dev_have_ref(c, ptr->dev) : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - if (to_entry(ptr + 1) < ptrs.end) { + if (ptr != last) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; @@ -796,6 +799,9 @@ static void init_append_extent(struct bch_write_op *op, bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_cached); + if (!(op->flags & BCH_WRITE_move)) + bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); + bch2_keylist_push(&op->insert_keys); } From a96c5e504538cf150ec6e3b19702f8c658298323 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Mon, 19 May 2025 19:51:04 +0800 Subject: [PATCH 187/218] bcachefs: Remove duplicate call to bch2_trans_begin() There is one in for_each_btree_key_max(). 
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 7cb0b3d347b4..e7a2a13554d7 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -154,8 +154,6 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; - bch2_trans_begin(trans); - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), From c3a7fd95e02a33cad16f3c455e482951bcbc7224 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 May 2025 09:15:31 -0400 Subject: [PATCH 188/218] bcachefs: Don't set bi_casefold on non directories bi_casefold only makes sense for directories, and since it's one of the variable length fields setting it unnecessarily wastes space. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index a17a952ea161..ee0080507855 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -856,6 +856,9 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, BCH_INODE_OPTS() #undef x } + + if (!S_ISDIR(mode)) + inode_u->bi_casefold = 0; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, From 011d644b769609ecf6ceef71a1411c4a9e008a5b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 May 2025 09:12:49 -0400 Subject: [PATCH 189/218] bcachefs: subvol_inum_eq() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 --- fs/bcachefs/fs.c | 5 ----- fs/bcachefs/inode.h | 8 ++++++++ fs/bcachefs/namei.c | 6 ++---- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3077f15439cd..7824da2af9d0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -736,9 +736,6 @@ struct btree_trans_buf { struct btree_trans *trans; }; -#define BCACHEFS_ROOT_SUBVOL_INUM \ - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - #define BCH_WRITE_REFS() \ x(journal) \ x(trans) \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 9916bd38a599..ddfe89d84966 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -191,11 +191,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static bool subvol_inum_eq(subvol_inum a, subvol_inum b) -{ - return a.subvol == b.subvol && a.inum == b.inum; -} - static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index bb81b7c269bb..bf6624aadc56 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -309,6 +309,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode return io_opts_to_rebalance_opts(c, &io_opts); 
} +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + +static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) +{ + return a.subvol == b.subvol && a.inum == b.inum; +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 9136a9097789..ce2d34698e64 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -404,8 +404,7 @@ int bch2_rename_trans(struct btree_trans *trans, src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir.inum != src_dir.inum || - dst_dir.subvol != src_dir.subvol) { + if (!subvol_inum_eq(dst_dir, src_dir)) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, BTREE_ITER_intent); if (ret) @@ -599,8 +598,7 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb unsigned orig_pos = path->pos; int ret = 0; - while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && - inum.inum == BCACHEFS_ROOT_INO)) { + while (!subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) { struct bch_inode_unpacked inode; ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); if (ret) From 7c4f22af251ae32762cbacb998f86977535ee0f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 May 2025 09:17:39 -0400 Subject: [PATCH 190/218] bcachefs: bch2_rename_trans() only runs rename-to-dir code if needed Signed-off-by: Kent Overstreet --- fs/bcachefs/namei.c | 52 +++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index ce2d34698e64..e81e554b51d3 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -496,33 +496,35 @@ int bch2_rename_trans(struct btree_trans *trans, } } - if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; + if (!subvol_inum_eq(dst_dir, src_dir)) { + if 
(bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } + + if (is_subdir_for_nlink(src_inode_u)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } + + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; } - if (mode == BCH_RENAME_EXCHANGE && - bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } - - if (is_subdir_for_nlink(src_inode_u)) { - src_dir_u->bi_nlink--; - dst_dir_u->bi_nlink++; - } - - if (S_ISDIR(src_inode_u->bi_mode) && - !src_inode_u->bi_subvol) - src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; - - if (mode == BCH_RENAME_EXCHANGE && - S_ISDIR(dst_inode_u->bi_mode) && - !dst_inode_u->bi_subvol) - dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; - if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; From 77aeaa2f0fcac2d68fe1af61169fe26d5fac9a22 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 May 2025 09:48:50 -0400 Subject: [PATCH 191/218] bcachefs: bch2_inum_snapshot_to_path() Add a better helper for printing out paths of inodes when we don't know the subvolume, for fsck. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 22 +++------------------- fs/bcachefs/namei.c | 18 ++++++++++++++++++ fs/bcachefs/namei.h | 2 ++ fs/bcachefs/snapshot.c | 29 +++++++++++++++++++---------- fs/bcachefs/snapshot.h | 2 +- 5 files changed, 43 insertions(+), 30 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index a476dd2c196e..c2cad28635bf 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -742,25 +742,9 @@ void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bpos pos) { - struct bch_fs *c = trans->c; - int ret = 0; - - if (!bch2_snapshot_is_leaf(c, pos.snapshot)) - prt_str(out, "(multiple snapshots) "); - - subvol_inum inum = { - .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), - .inum = pos.inode, - }; - - if (inum.subvol) { - ret = bch2_inum_to_path(trans, inum, out); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - } - - if (!inum.subvol || ret) - prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); + int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); + if (ret) + return ret; prt_printf(out, " offset %llu: ", pos.offset << 8); return 0; diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index e81e554b51d3..8088e810815f 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -654,6 +654,24 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb goto out; } +int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, + snapshot_id_list *snapshot_overwrites, + struct printbuf *path) +{ + u32 subvol = bch2_snapshot_oldest_subvol(trans->c, snapshot, snapshot_overwrites); + int ret = 0; + + if (subvol) { + ret = bch2_inum_to_path(trans, (subvol_inum) { subvol, inum }, path); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } + + if (!subvol || ret) + 
prt_printf(path, "inum %llu:%u", inum, snapshot); + return 0; +} + /* fsck */ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index 2e6f6364767f..d4d2d2d69517 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -43,6 +43,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); +int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, + snapshot_id_list *, struct printbuf *); int __bch2_check_dirent_target(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 24903e7de296..00d62d1190ef 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -409,22 +409,31 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) return 0; } -u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, + snapshot_id_list *skip) { - u32 id = snapshot_root; - u32 subvol = 0, s; - + u32 id, subvol = 0, s; +retry: + id = snapshot_root; rcu_read_lock(); while (id && bch2_snapshot_exists(c, id)) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; + if (!(skip && snapshot_list_has_id(skip, id))) { + s = snapshot_t(c, id)->subvol; + if (s && (!subvol || s < subvol)) + subvol = s; + } id = bch2_snapshot_tree_next(c, id); + if (id == snapshot_root) + break; } rcu_read_unlock(); + if (!subvol && skip) { + skip = NULL; + goto retry; + } + return subvol; } @@ -456,7 +465,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, if (!ret && !found) { struct bkey_i_subvolume *u; - *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); u = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, *subvol_id), 
@@ -673,7 +682,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); ret = PTR_ERR_OR_ZERO(u) ?: bch2_snapshot_tree_create(trans, root_id, - bch2_snapshot_tree_oldest_subvol(c, root_id), + bch2_snapshot_oldest_subvol(c, root_id, NULL), &tree_id); if (ret) goto err; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 63b9469eb1eb..382a171f5413 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -105,7 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) return id; } -u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); +u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) From 77eac89c7943a4ed5c9bc42def9b3140d951184f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 May 2025 10:10:19 -0400 Subject: [PATCH 192/218] bcachefs: bch2_inode_find_by_inum_snapshot() Move a fsck.c helper into inode.c, eliminate some duplication and organize the inode lookup helpers. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 31 ++--------- fs/bcachefs/inode.c | 132 +++++++++++++++++++++++++------------------- fs/bcachefs/inode.h | 28 +++++----- 3 files changed, 96 insertions(+), 95 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9d94d31cfec9..5402c40e3697 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -109,27 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, snapshot), 0); - ret = bkey_err(k); - if (ret) - goto err; - - ret = bkey_is_inode(k.k) - ? 
bch2_inode_unpack(k, inode) - : -BCH_ERR_ENOENT_inode; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, @@ -231,7 +210,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", root_inum.inum, subvolid); if (ret) @@ -257,7 +236,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - ret = lookup_inode(trans, inum, snapshot, lostfound); + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); return ret; @@ -2117,7 +2096,8 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * u64 target_inum = le64_to_cpu(s.v->inode); u32 target_snapshot = le32_to_cpu(s.v->snapshot); - ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); + ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, + &subvol_root, 0); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -2434,7 +2414,8 @@ static int check_root_trans(struct btree_trans *trans) goto err; } - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, + &root_inode, 0); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index ee0080507855..13c1e9df252a 100644 --- 
a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -367,6 +367,82 @@ int __bch2_inode_peek(struct btree_trans *trans, return ret; } +int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, + u64 inode_nr, u32 snapshot, + struct bch_inode_unpacked *inode, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, snapshot), flags); + int ret = bkey_err(k); + if (ret) + goto err; + + ret = bkey_is_inode(k.k) + ? bch2_inode_unpack(k, inode) + : -BCH_ERR_ENOENT_inode; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); +} + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) { + ret = bch2_inode_unpack(k, root); + goto out; + } + } + /* We're only called when we know we have an inode for @inum */ + BUG_ON(!ret); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_inode_write_flags(struct 
btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -1102,62 +1178,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) return ret; } -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); -} - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) { - ret = bch2_inode_unpack(k, root); - goto out; - } - } - /* We're only called when we know we have an inode for @inum */ - BUG_ON(!ret); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { if (bi->bi_flags & BCH_INODE_unlinked) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index bf6624aadc56..c31567c09b8a 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans, subvol_inum inum, unsigned flags) { return __bch2_inode_peek(trans, iter, inode, inum, flags, true); - int ret = bch2_inode_peek_nowarn(trans, iter, inode, 
inum, flags); - return ret; } +int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, + struct bch_inode_unpacked *, unsigned); +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root); + int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); @@ -165,17 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum); -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, - subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, - struct bch_inode_unpacked *); - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root); - #define inode_opt_get(_c, _inode, _name) \ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) @@ -248,7 +248,7 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k) static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) { - /* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */ + /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ return bi->bi_casefold ? 
bi->bi_casefold - 1 : c->opts.casefold; From 4ba99dde330b2d4b6de65f27ced60e7f0fbc21c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 May 2025 10:31:44 -0400 Subject: [PATCH 193/218] bcachefs: BCH_INODE_has_case_insensitive Add a flag for tracking whether a directory has case-insensitive descendents - so that overlayfs can disallow mounting, even though the filesystem supports case insensitivity. This is a new on disk format version, with a (cheap) upgrade to ensure the flag is correctly set on existing inodes. Create, rename and fssetxattr are all plumbed to ensure the new flag is set, and we've got new fsck code that hooks into check_inode(). Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/fsck.c | 10 +- fs/bcachefs/inode.c | 10 +- fs/bcachefs/inode.h | 2 +- fs/bcachefs/inode_format.h | 7 +- fs/bcachefs/namei.c | 166 +++++++++++++++++++++++++++++++++- fs/bcachefs/namei.h | 5 + fs/bcachefs/sb-downgrade.c | 6 +- 8 files changed, 196 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5900ff3715c6..b4a04df5ea95 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -699,7 +699,8 @@ struct bch_sb_field_ext { x(casefolding, BCH_VERSION(1, 24)) \ x(extent_flags, BCH_VERSION(1, 25)) \ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ - x(fast_device_removal, BCH_VERSION(1, 27)) + x(fast_device_removal, BCH_VERSION(1, 27)) \ + x(inode_has_case_insensitive, BCH_VERSION(1, 28)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5402c40e3697..dbfa3e0b8abb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -264,7 +264,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); - bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); + bch2_inode_init_late(c, 
lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); lostfound->bi_dir = root_inode.bi_inum; lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); @@ -545,7 +545,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); new_inode.bi_subvol = subvolid; @@ -635,7 +635,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct bch_inode_unpacked new_inode; bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); new_inode.bi_size = i_size; new_inode.bi_inum = inum; new_inode.bi_snapshot = snapshot; @@ -1137,6 +1137,10 @@ static int check_inode(struct btree_trans *trans, goto err; } + ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); + if (ret) + goto err; + if (u.bi_dir || u.bi_dir_offset) { ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 13c1e9df252a..5cf70108ae2f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -908,7 +908,8 @@ void bch2_inode_init_early(struct bch_fs *c, get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); } -void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, +void bch2_inode_init_late(struct bch_fs *c, + struct bch_inode_unpacked *inode_u, u64 now, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct bch_inode_unpacked *parent) { @@ -935,6 +936,9 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, if (!S_ISDIR(mode)) inode_u->bi_casefold = 0; + + if (bch2_inode_casefold(c, inode_u)) + inode_u->bi_flags |= 
BCH_INODE_has_case_insensitive; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, @@ -942,7 +946,7 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, struct bch_inode_unpacked *parent) { bch2_inode_init_early(c, inode_u); - bch2_inode_init_late(inode_u, bch2_current_time(c), + bch2_inode_init_late(c, inode_u, bch2_current_time(c), uid, gid, mode, rdev, parent); } @@ -1279,7 +1283,7 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, bi->bi_casefold = v + 1; bi->bi_fields_set |= BIT(Inode_opt_casefold); - return 0; + return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); #else bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); return -EOPNOTSUPP; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index c31567c09b8a..77ad2d549541 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -164,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); -void bch2_inode_init_late(struct bch_inode_unpacked *, u64, +void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 87e193e8ed25..1f00938b1bdc 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -129,6 +129,10 @@ enum inode_opt_id { Inode_opt_nr, }; +/* + * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive - + * for overlayfs + */ #define BCH_INODE_FLAGS() \ x(sync, 0) \ x(immutable, 1) \ @@ -139,7 +143,8 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) + x(has_child_snapshot, 9) \ + x(has_case_insensitive, 10) /* bits 20+ reserved for packed fields below: */ diff 
--git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 8088e810815f..bd093ce56ad9 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -11,6 +11,14 @@ #include +static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) +{ + return (subvol_inum) { + .subvol = inode->bi_parent_subvol ?: inum.subvol, + .inum = inode->bi_dir, + }; +} + static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) { return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; @@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); if (flags & BCH_CREATE_TMPFILE) new_inode->bi_flags |= BCH_INODE_unlinked; @@ -510,6 +518,13 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: + (mode == BCH_RENAME_EXCHANGE + ? 
bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) + : 0); + if (ret) + goto err; + if (is_subdir_for_nlink(src_inode_u)) { src_dir_u->bi_nlink--; dst_dir_u->bi_nlink++; @@ -611,8 +626,7 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb goto disconnected; } - inum.subvol = inode.bi_parent_subvol ?: inum.subvol; - inum.inum = inode.bi_dir; + inum = parent_inum(inum, &inode); u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); @@ -847,3 +861,149 @@ int __bch2_check_dirent_target(struct btree_trans *trans, bch_err_fn(c, ret); return ret; } + +/* + * BCH_INODE_has_case_insensitive: + * We have to track whether directories have any descendent directory that is + * casefolded - for overlayfs: + */ + +static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) +{ + struct btree_iter iter = {}; + int ret = 0; + + while (true) { + struct bch_inode_unpacked inode; + ret = bch2_inode_peek(trans, &iter, &inode, inum, + BTREE_ITER_intent|BTREE_ITER_with_updates); + if (ret) + break; + + if (inode.bi_flags & BCH_INODE_has_case_insensitive) + break; + + inode.bi_flags |= BCH_INODE_has_case_insensitive; + ret = bch2_inode_write(trans, &iter, &inode); + if (ret) + break; + + bch2_trans_iter_exit(trans, &iter); + if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) + break; + + inum = parent_inum(inum, &inode); + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + if (!bch2_inode_casefold(trans->c, inode)) + return 0; + + inode->bi_flags |= BCH_INODE_has_case_insensitive; + + return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); +} + +int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + snapshot_id_list *snapshot_overwrites, + bool *do_update) +{ + struct 
printbuf buf = PRINTBUF; + bool repairing_parents = false; + int ret = 0; + + if (!S_ISDIR(inode->bi_mode)) { + /* + * Old versions set bi_casefold for non dirs, but that's + * unnecessary and wasteful + */ + if (inode->bi_casefold) { + inode->bi_casefold = 0; + *do_update = true; + } + return 0; + } + + if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) + return 0; + + if (bch2_inode_casefold(trans->c, inode) && + !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", + inode->bi_inum, inode->bi_snapshot); + + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + goto err; + + if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { + inode->bi_flags |= BCH_INODE_has_case_insensitive; + *do_update = true; + } + } + + if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) + goto out; + + struct bch_inode_unpacked dir = *inode; + u32 snapshot = dir.bi_snapshot; + + while (!(dir.bi_inum == BCACHEFS_ROOT_INO && + dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + if (dir.bi_parent_subvol) { + ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); + if (ret) + goto err; + + snapshot_overwrites = NULL; + } + + ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); + if (ret) + goto err; + + if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); + + ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + goto err; + + if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { + dir.bi_flags |= BCH_INODE_has_case_insensitive; + ret = __bch2_fsck_write_inode(trans, &dir); + if (ret) + goto err; + } + } + + /* + * We only need to check the first parent, unless we find an + * 
inconsistency + */ + if (!repairing_parents) + break; + } +out: +err: +fsck_err: + printbuf_exit(&buf); + if (ret) + return ret; + + if (repairing_parents) { + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + } + + return 0; +} diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index d4d2d2d69517..ae6ebc2d0785 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -71,4 +71,9 @@ static inline int bch2_check_dirent_target(struct btree_trans *trans, return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); } +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, + snapshot_id_list *, bool *); + #endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 296c6c925386..861fce1630f0 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -100,7 +100,11 @@ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ x(stripe_backpointers, \ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(inode_has_case_insensitive, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ + BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ From 2faa8ab0d03cd4e619024a257f69b7e51ac82c15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 May 2025 20:15:39 -0400 Subject: [PATCH 194/218] bcachefs: fix duplicate printk Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index c08bc6685078..cde7dd115267 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ 
-123,8 +123,6 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); - - bch_err(c, "%s", buf.buf); } else if (!will_check) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); From f638b84224348dd58a348617e6f648e967e1b1ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 May 2025 22:59:58 -0400 Subject: [PATCH 195/218] bcachefs: fix bch2_inum_snapshot_to_path() Signed-off-by: Kent Overstreet --- fs/bcachefs/namei.c | 50 +++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index bd093ce56ad9..a84b69d6caef 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -609,29 +609,39 @@ static inline void reverse_bytes(void *b, size_t n) } } -/* XXX: we don't yet attempt to print paths when we don't know the subvol */ -int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +static int __bch2_inum_to_path(struct btree_trans *trans, + u32 subvol, u64 inum, u32 snapshot, + struct printbuf *path) { unsigned orig_pos = path->pos; int ret = 0; - while (!subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) { + while (true) { + if (!snapshot) { + ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); + if (ret) + goto disconnected; + } + struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); if (ret) goto disconnected; + if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && + inode.bi_inum == BCACHEFS_ROOT_INO) + break; + if (!inode.bi_dir && !inode.bi_dir_offset) { ret = -BCH_ERR_ENOENT_inode_no_backpointer; goto disconnected; } - inum = parent_inum(inum, &inode); - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto disconnected; + inum = inode.bi_dir; + if 
(inode.bi_parent_subvol) { + subvol = inode.bi_parent_subvol; + snapshot = 0; + } struct btree_iter d_iter; struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, @@ -668,22 +678,18 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb goto out; } +int bch2_inum_to_path(struct btree_trans *trans, + subvol_inum inum, + struct printbuf *path) +{ + return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); +} + int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, snapshot_id_list *snapshot_overwrites, struct printbuf *path) { - u32 subvol = bch2_snapshot_oldest_subvol(trans->c, snapshot, snapshot_overwrites); - int ret = 0; - - if (subvol) { - ret = bch2_inum_to_path(trans, (subvol_inum) { subvol, inum }, path); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - } - - if (!subvol || ret) - prt_printf(path, "inum %llu:%u", inum, snapshot); - return 0; + return __bch2_inum_to_path(trans, 0, inum, snapshot, path); } /* fsck */ From 136d082abc2adcdc10a472e29710826eee7f5f80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 May 2025 03:19:18 -0400 Subject: [PATCH 196/218] bcachefs: Improve trace_trans_restart_upgrade - Convert to a 'fs_str' tracepoint that just emits as a string: this lets us build up the tracepoint with a printbuf, using our pretty printers, and they're much easier to manage - Include locks_held, before and after - Include the btree node pointer we failed on (error pointer, null, or real node) Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 23 ++++++++++++++++-- fs/bcachefs/trace.h | 48 +++---------------------------------- 2 files changed, 24 insertions(+), 47 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 6663e186a960..59a366fdd24c 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -631,6 +631,7 @@ int __bch2_btree_path_upgrade(struct btree_trans *trans, 
unsigned new_locks_want) { struct get_locks_fail f = {}; + unsigned old_locks = path->nodes_locked; unsigned old_locks_want = path->locks_want; int ret = 0; @@ -670,8 +671,26 @@ int __bch2_btree_path_upgrade(struct btree_trans *trans, } } - trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, - old_locks_want, new_locks_want, &f); + count_event(trans->c, trans_restart_upgrade); + if (trace_trans_restart_upgrade_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); + prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, "locks want %u -> %u level %u\n", + old_locks_want, new_locks_want, f.l); + prt_printf(&buf, "nodes_locked %x -> %x\n", + old_locks, path->nodes_locked); + prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : + !f.b ? "(null)" : "(node)"); + prt_printf(&buf, "path seq %u node seq %u\n", + IS_ERR_OR_NULL(f.b) ? 
0 : f.b->c.lock.seq, + path->l[f.l].lock_seq); + + trace_trans_restart_upgrade(trans->c, buf.buf); + printbuf_exit(&buf); + } ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); out: bch2_trans_verify_locks(trans); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index a31024f082f3..8cb5b40704fd 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1127,51 +1127,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -TRACE_EVENT(trans_restart_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_locks_want, - unsigned new_locks_want, - struct get_locks_fail *f), - TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, old_locks_want ) - __field(u8, new_locks_want ) - __field(u8, level ) - __field(u32, path_seq ) - __field(u32, node_seq ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->old_locks_want = old_locks_want; - __entry->new_locks_want = new_locks_want; - __entry->level = f->l; - __entry->path_seq = path->l[f->l].lock_seq; - __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 
0 : f->b->c.lock.seq; - TRACE_BPOS_assign(pos, path->pos) - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->old_locks_want, - __entry->new_locks_want, - __entry->level, - __entry->path_seq, - __entry->node_seq) +DEFINE_EVENT(fs_str, trans_restart_upgrade, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(trans_str, trans_restart_relock, From bfc0c6fecf3bd2da93beb565ccfb9e704cadddcc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 May 2025 15:54:56 -0400 Subject: [PATCH 197/218] bcachefs: Drop empty accounting updates Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 488c342b9cd7..b3840ff7c407 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -111,6 +111,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, if (bpos_eq(a->k.p, pos)) { BUG_ON(nr != bch2_accounting_counters(&a->k)); acc_u64s(a->v.d, d, nr); + + if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { + unsigned offset = (u64 *) a - + (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); + + trans->accounting.u64s -= a->k.u64s; + memmove_u64s_down(a, + bkey_next(&a->k_i), + trans->accounting.u64s - offset); + } return 0; } #endif From 4a9eb20efa9f07b54382bb2713439fc09336d45a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 12:34:40 -0400 Subject: [PATCH 198/218] bcachefs: Kill bkey_buf usage in data_update_index_update() Reduce stack usage - bkey_buf has a 96 byte buffer on the stack, but the btree_trans bump allocator works just fine here. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index de096ca65b4b..ef648a6d9c52 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -187,14 +187,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct data_update *m = container_of(op, struct data_update, op); struct keylist *keys = &op->insert_keys; - struct bkey_buf _new, _insert; struct printbuf journal_msg = PRINTBUF; int ret = 0; - bch2_bkey_buf_init(&_new); - bch2_bkey_buf_init(&_insert); - bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - bch2_trans_iter_init(trans, &iter, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_slots|BTREE_ITER_intent); @@ -229,11 +224,22 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, goto nowork; } - bkey_reassemble(_insert.k, k); - insert = _insert.k; + insert = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + + bkey_val_bytes(&new->k) + + sizeof(struct bch_extent_rebalance)); + ret = PTR_ERR_OR_ZERO(insert); + if (ret) + goto err; - bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); - new = bkey_i_to_extent(_new.k); + bkey_reassemble(insert, k); + + new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_copy(&new->k_i, bch2_keylist_front(keys)); bch2_cut_front(iter.pos, &new->k_i); bch2_cut_front(iter.pos, insert); @@ -457,8 +463,6 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, out: printbuf_exit(&journal_msg); bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&_insert, c); - bch2_bkey_buf_exit(&_new, c); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } From 7d886a82bf9cb2b3b0e591a915c89c4696598149 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 12:49:56 -0400 Subject: [PATCH 199/218] 
bcachefs: bch2_trans_log_str() The data update path doesn't need a printbuf for its log message - this will help reduce stack usage. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 26 ++++++++++++++++++-------- fs/bcachefs/btree_update.h | 1 + 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 20fba8d17431..5dac09c98026 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -828,25 +828,35 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len) { - unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); - - int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - return ret; + unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - ret = PTR_ERR_OR_ZERO(e); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy_and_pad(l->d, u64s * sizeof(u64), buf->buf, buf->pos, 0); + memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); return 0; } +int bch2_trans_log_str(struct btree_trans *trans, const char *str) +{ + return __bch2_trans_log_str(trans, str, strlen(str)); +} + +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +{ + int ret = buf->allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + + return __bch2_trans_log_str(trans, buf->buf, buf->pos); +} + int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_i *k) { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index a54dc7277177..f907eaa8b185 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -205,6 +205,7 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_str(struct btree_trans *, const char *); int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); From d385ca5603a3af3c5cc85d16e42e6063a257ea55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 12:50:22 -0400 Subject: [PATCH 200/218] bcachefs: Reduce stack usage in data_update_index_update() Separate tracepoint message generation and other slowpath code into non-inline functions, and use bch2_trans_log_str() instead of using a printbuf for our journal message. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 154 +++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 67 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index ef648a6d9c52..c34e5b88ba9d 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -100,9 +100,10 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static noinline void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) +noinline_for_stack +static void trace_io_move_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { struct bch_fs *c = u->op.c; struct printbuf buf = PRINTBUF; @@ -124,6 +125,7 @@ static noinline void trace_io_move_finish2(struct data_update *u, printbuf_exit(&buf); } +noinline_for_stack static void trace_io_move_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, @@ -179,19 +181,84 @@ static void trace_io_move_fail2(struct data_update *m, printbuf_exit(&buf); } +noinline_for_stack +static void trace_data_update2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_data_update(c, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static void trace_io_move_created_rebalance2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + 
prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_io_move_created_rebalance(c, buf.buf); + printbuf_exit(&buf); + + this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); +} + +noinline_for_stack +static int data_update_invalid_bkey(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + return -BCH_ERR_invalid_bkey; +} + static int __bch2_data_update_index_update(struct btree_trans *trans, struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_iter iter; - struct data_update *m = - container_of(op, struct data_update, op); - struct keylist *keys = &op->insert_keys; - struct printbuf journal_msg = PRINTBUF; + struct data_update *m = container_of(op, struct data_update, op); int ret = 0; bch2_trans_iter_init(trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), + bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), BTREE_ITER_slots|BTREE_ITER_intent); while (1) { @@ -216,11 +283,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (ret) goto err; - new = bkey_i_to_extent(bch2_keylist_front(keys)); + new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); if (!bch2_extents_match(k, old)) { trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); + NULL, "no match:"); goto nowork; } @@ -239,7 +306,7 @@ static int __bch2_data_update_index_update(struct 
btree_trans *trans, if (ret) goto err; - bkey_copy(&new->k_i, bch2_keylist_front(keys)); + bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); bch2_cut_front(iter.pos, &new->k_i); bch2_cut_front(iter.pos, insert); @@ -353,31 +420,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, .flags = BCH_VALIDATE_commit, }); if (unlikely(invalid)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - ret = -BCH_ERR_invalid_bkey; + ret = data_update_invalid_bkey(m, old, k, insert); goto out; } - printbuf_reset(&journal_msg); - prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); - - ret = bch2_trans_log_msg(trans, &journal_msg) ?: + ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: @@ -389,38 +436,12 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (ret) goto err; - if (trace_data_update_enabled()) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_data_update(c, buf.buf); - printbuf_exit(&buf); - } + if (trace_data_update_enabled()) + trace_data_update2(m, old, k, insert); if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > - bch2_bkey_sectors_need_rebalance(c, k) 
* insert->k.size) { - struct printbuf buf = PRINTBUF; - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_io_move_created_rebalance(c, buf.buf); - printbuf_exit(&buf); - - this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); - } + bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) + trace_io_move_created_rebalance2(m, old, k, insert); ret = bch2_trans_commit(trans, &op->res, NULL, @@ -441,9 +462,9 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (ret) break; next: - while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { - bch2_keylist_pop_front(keys); - if (bch2_keylist_empty(keys)) + while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) { + bch2_keylist_pop_front(&op->insert_keys); + if (bch2_keylist_empty(&op->insert_keys)) goto out; } continue; @@ -461,7 +482,6 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, goto next; } out: - printbuf_exit(&journal_msg); bch2_trans_iter_exit(trans, &iter); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; From 2a6c0136ae9ac44f2b097e469d4cee95cd11e4f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 May 2025 21:54:35 -0400 Subject: [PATCH 201/218] bcachefs: bch2_journal_write_checksum() We need to delay checksumming the journal write; we don't know the blocksize until after we allocate the write. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 06f7b018492c..63bb207208b2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1864,9 +1864,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) struct jset_entry *start, *end; struct jset *jset = w->data; struct journal_keys_to_wb wb = { NULL }; - unsigned sectors, bytes, u64s; + unsigned u64s; unsigned long btree_roots_have = 0; - bool validate_before_checksum = false; u64 seq = le64_to_cpu(jset->seq); int ret; @@ -1949,8 +1948,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) le32_add_cpu(&jset->u64s, u64s); - sectors = vstruct_sectors(jset, c->block_bits); - bytes = vstruct_bytes(jset); + unsigned sectors = vstruct_sectors(jset, c->block_bits); if (sectors > w->sectors) { bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", @@ -1959,6 +1957,17 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) return -EINVAL; } + return 0; +} + +static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset *jset = w->data; + u64 seq = le64_to_cpu(jset->seq); + bool validate_before_checksum = false; + int ret = 0; + jset->magic = cpu_to_le64(jset_magic(c)); jset->version = cpu_to_le32(c->sb.version); @@ -1981,7 +1990,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) + if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) return ret; 
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), @@ -1991,6 +2000,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) (ret = jset_validate(c, NULL, jset, 0, WRITE))) return ret; + unsigned sectors = vstruct_sectors(jset, c->block_bits); + unsigned bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); return 0; } @@ -2088,6 +2099,10 @@ CLOSURE_CALLBACK(bch2_journal_write) if (unlikely(ret)) goto err_allocate_write; + ret = bch2_journal_write_checksum(j, w); + if (unlikely(ret)) + goto err; + spin_lock(&j->lock); /* * write is allocated, no longer need to account for it in From 659489f37bd0471d5a77abdfe86eb105ad11297e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 15:40:24 -0400 Subject: [PATCH 202/218] bcachefs: Kill bch2_path_put_nokeep() bch2_path_put_nokeep() was intended for paths we wouldn't need to preserve for a transaction restart - it always frees them right away when the ref hits 0. But since paths are shared, freeing unconditionally is a bug: the path might have been used elsewhere and have should_be_locked set, in which case we need to keep it locked until the end of the transaction. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0f0b80c8c29a..b366407878d0 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1431,15 +1431,6 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in __bch2_path_free(trans, path_idx); } -static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, - bool intent) -{ - if (!__btree_path_put(trans, trans->paths + path, intent)) - return; - - __bch2_path_free(trans, path); -} - void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) { panic("trans->restart_count %u, should be %u, last restarted by %pS\n", @@ -2358,8 +2349,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree } if (iter->update_path) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2388,8 +2378,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree if (iter->update_path && !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2648,7 +2638,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct * the last possible snapshot overwrite, return * it: */ - bch2_path_put_nokeep(trans, iter->path, + bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_intent); iter->path = saved_path; saved_path = 0; @@ -2678,8 +2668,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct * our previous saved candidate: */ if (saved_path) { - 
bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_intent); saved_path = 0; } @@ -2722,7 +2712,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(trans, iter); @@ -3045,7 +3035,7 @@ static inline void btree_path_list_add(struct btree_trans *trans, void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) - bch2_path_put_nokeep(trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, From 5b7b342c402df2cfb1d9a8ea79613742d61d1293 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 18:03:32 -0400 Subject: [PATCH 203/218] bcachefs: btree_node_locked_type_nowrite() Small helper to improve locking assertions. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 7 ++----- fs/bcachefs/btree_locking.h | 13 +++++++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 59a366fdd24c..4745c2035d24 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -877,14 +877,11 @@ void __bch2_btree_path_verify_locks(struct btree_path *path) for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); - int have = btree_node_locked_type(path, l); + int have = btree_node_locked_type_nowrite(path, l); BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); - BUG_ON(is_btree_node(path, l) && - (want == BTREE_NODE_UNLOCKED || - have != BTREE_NODE_WRITE_LOCKED) && - want != have); + BUG_ON(is_btree_node(path, l) && want != have); BUG_ON(btree_node_locked(path, l) && path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 1bb28e21d021..7e162982de17 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -43,6 +43,15 @@ static inline int btree_node_locked_type(struct btree_path *path, return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } +static inline int btree_node_locked_type_nowrite(struct btree_path *path, + unsigned level) +{ + int have = btree_node_locked_type(path, level); + return have == BTREE_NODE_WRITE_LOCKED + ? 
BTREE_NODE_INTENT_LOCKED + : have; +} + static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) { return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; @@ -366,8 +375,8 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, struct btree_path *path, unsigned level) { EBUG_ON(btree_node_locked(path, level) && - !btree_node_write_locked(path, level) && - btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + btree_node_locked_type_nowrite(path, level) != + __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || (!IS_ERR_OR_NULL(path->l[level].b) && From 66782b2acbc3291faba7e14d9b22b77a4f3f94e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 16:54:31 -0400 Subject: [PATCH 204/218] bcachefs: Fix btree_path_get_locks when not doing trans restart btree_path_get_locks, on failure, shouldn't unlock if we're not issuing a transaction restart: we might drop locks we're not supposed to (if path->should_be_locked is set). 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_locking.c | 108 ++++++++++++++++++++---------------- fs/bcachefs/btree_locking.h | 13 ++++- 3 files changed, 72 insertions(+), 51 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b366407878d0..831275f8e79f 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1799,7 +1799,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > path->locks_want) - bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); + bch2_btree_path_upgrade_norestart(trans, path, locks_want); return path_idx; } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 4745c2035d24..6e43269a9c47 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -451,13 +451,13 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, /* relock */ -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade, - struct get_locks_fail *f) +static int btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade, + struct get_locks_fail *f, + int restart_err) { unsigned l = path->level; - int fail_idx = -1; do { if (!btree_path_node(path, l)) @@ -465,39 +465,49 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) { - fail_idx = l; - - if (f) { - f->l = l; - f->b = path->l[l].b; - } - } + : bch2_btree_node_relock(trans, path, l))) + goto err; l++; } while (l < path->locks_want); + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + return path->uptodate < BTREE_ITER_NEED_RELOCK ? 
0 : -1; +err: + if (f) { + f->l = l; + f->b = path->l[l].b; + } + + /* + * Do transaction restart before unlocking, so we don't pop + * should_be_locked asserts + */ + if (restart_err) { + btree_trans_restart(trans, restart_err); + } else if (path->should_be_locked && !trans->restarted) { + if (upgrade) + path->locks_want = l; + return -1; + } + + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + /* * When we fail to get a lock, we have to ensure that any child nodes * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + do { + path->l[l].b = upgrade + ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + } while (l--); - do { - path->l[fail_idx].b = upgrade - ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - return path->uptodate < BTREE_ITER_NEED_RELOCK; + return -restart_err ?: -1; } bool __bch2_btree_node_relock(struct btree_trans *trans, @@ -596,9 +606,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, __flatten bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { - struct get_locks_fail f; - - bool ret = btree_path_get_locks(trans, path, false, &f); + bool ret = !btree_path_get_locks(trans, path, false, NULL, 0); bch2_trans_verify_locks(trans); return ret; } @@ -614,15 +622,16 @@ int __bch2_btree_path_relock(struct btree_trans *trans, return 0; } -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct 
btree_path *path, + unsigned new_locks_want) { - path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); + path->locks_want = new_locks_want; - bool ret = btree_path_get_locks(trans, path, true, f); - bch2_trans_verify_locks(trans); + struct get_locks_fail f = {}; + bool ret = !btree_path_get_locks(trans, path, true, &f, 0); + + bch2_btree_path_verify_locks(path); return ret; } @@ -630,12 +639,15 @@ int __bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f = {}; unsigned old_locks = path->nodes_locked; unsigned old_locks_want = path->locks_want; - int ret = 0; - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, &f)) + path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); + + struct get_locks_fail f = {}; + int ret = btree_path_get_locks(trans, path, true, &f, + BCH_ERR_transaction_restart_upgrade); + if (!ret) goto out; /* @@ -667,7 +679,7 @@ int __bch2_btree_path_upgrade(struct btree_trans *trans, linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, NULL); + btree_path_get_locks(trans, linked, true, NULL, 0); } } @@ -691,7 +703,6 @@ int __bch2_btree_path_upgrade(struct btree_trans *trans, trace_trans_restart_upgrade(trans->c, buf.buf); printbuf_exit(&buf); } - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); out: bch2_trans_verify_locks(trans); return ret; @@ -752,7 +763,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans) __bch2_btree_path_unlock(trans, path); } -static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, +static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, struct get_locks_fail *f, bool trace) { if (!trace) @@ -786,7 +797,6 @@ static noinline __cold int bch2_trans_relock_fail(struct 
btree_trans *trans, str out: __bch2_trans_unlock(trans); bch2_trans_verify_locks(trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) @@ -803,10 +813,14 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) trans_for_each_path(trans, path, i) { struct get_locks_fail f; + int ret; if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) - return bch2_trans_relock_fail(trans, path, &f, trace); + (ret = btree_path_get_locks(trans, path, false, &f, + BCH_ERR_transaction_restart_relock))) { + bch2_trans_relock_fail(trans, path, &f, trace); + return ret; + } } trans_set_locked(trans, true); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7e162982de17..63d7e5fb77c8 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -385,9 +385,16 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned); + +static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + return new_locks_want > path->locks_want + ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want) + : true; +} int __bch2_btree_path_upgrade(struct btree_trans *, struct btree_path *, unsigned); From aac49471b6c4a15cdb4bdade8c19527075af073d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 18:00:45 -0400 Subject: [PATCH 205/218] bcachefs: Give out new path if upgrade fails Avoid transaction restarts due to failure to upgrade - we can traverse a new iterator without a transaction restart. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 831275f8e79f..cae0fa60434b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1740,6 +1740,10 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, btree_trans_sort_paths(trans); + if (intent) + locks_want = max(locks_want, level + 1); + locks_want = min(locks_want, BTREE_MAX_DEPTH); + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, @@ -1754,7 +1758,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (path_pos && trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && - trans->paths[path_pos].level == level) { + trans->paths[path_pos].level == level && + bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) { trace_btree_path_get(trans, trans->paths + path_pos, &pos); __btree_path_get(trans, trans->paths + path_pos, intent); @@ -1786,9 +1791,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; - if (path->intent_ref) - locks_want = max(locks_want, level + 1); - /* * If the path has locks_want greater than requested, we don't downgrade * it here - on transaction restart because btree node split needs to @@ -1797,10 +1799,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, * a successful transaction commit. 
*/ - locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) - bch2_btree_path_upgrade_norestart(trans, path, locks_want); - return path_idx; } From be9fecdcdaf730dbf2ca70dbe5b6d42922df50d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 18:12:54 -0400 Subject: [PATCH 206/218] bcachefs: bch2_path_get() reuses paths if upgrade_fails & !should_be_locked Small additional optimization over the previous patch, bringing us closer to the original behaviour, except when we need to clone to avoid a transaction restart. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 6e43269a9c47..78f485ed1746 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -628,8 +628,13 @@ bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, { path->locks_want = new_locks_want; - struct get_locks_fail f = {}; - bool ret = !btree_path_get_locks(trans, path, true, &f, 0); + /* + * If we need it locked, we can't touch it. Otherwise, we can return + * success - bch2_path_get() will use this path, and it'll just be + * retraversed: + */ + bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || + !path->should_be_locked; bch2_btree_path_verify_locks(path); return ret; From eb34365adae033659384d1dedae99f73abd9815a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 16:03:08 -0400 Subject: [PATCH 207/218] bcachefs: Clear should_be_locked before unlock in key_cache_drop() We're adding new should_be_locked assertions, also add a comment explaining why clearing should_be_locked is safe here. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b8efe2fddbc4..9948d0e4d442 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -646,9 +646,16 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, unsigned i; trans_for_each_path(trans, path2, i) if (path2->l[0].b == (void *) ck) { + /* + * It's safe to clear should_be_locked here because + * we're evicting from the key cache, and we still have + * the underlying btree locked: filling into the key + * cache would require taking a write lock on the btree + * node + */ + path2->should_be_locked = false; __bch2_btree_path_unlock(trans, path2); path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - path2->should_be_locked = false; btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); } From df92f3500b3f78b8e0ed3faa95c15a834ea9a821 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 16:04:15 -0400 Subject: [PATCH 208/218] bcachefs: Clear trans->locked before unlock We're adding new should_be_locked assertions: it's going to be illegal to unlock a should_be_locked path when trans->locked is true. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 78f485ed1746..826930b4b164 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -846,9 +846,9 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) void bch2_trans_unlock(struct btree_trans *trans) { - __bch2_trans_unlock(trans); - trans_set_unlocked(trans); + + __bch2_trans_unlock(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) From 80a160e49414972b712f30b9b287d88197fe3077 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 15:33:14 -0400 Subject: [PATCH 209/218] bcachefs: Plumb btree_trans for more locking asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++++------ fs/bcachefs/btree_iter.h | 3 ++- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_locking.c | 12 ++++++------ fs/bcachefs/btree_locking.h | 11 ++++++----- 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index cae0fa60434b..51ff452562af 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -228,7 +228,7 @@ static void __bch2_btree_path_verify(struct btree_trans *trans, __bch2_btree_path_verify_level(trans, path, i); } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); } void __bch2_trans_verify_paths(struct btree_trans *trans) @@ -991,7 +991,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, path->level = level; bch2_btree_path_level_init(trans, path, b); - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); err: bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1103,7 +1103,7 @@ static void btree_path_set_level_down(struct btree_trans *trans, if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) btree_node_unlock(trans, path, l); - 
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_path_verify(trans, path); } @@ -1301,7 +1301,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, if (unlikely(path->cached)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); goto out; } @@ -1330,7 +1330,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } if (unlikely(level != path->level)) { - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); __bch2_btree_path_unlock(trans, path); } out: @@ -1984,7 +1984,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index cafd35a5e7a3..7cb2c38b70c0 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -46,7 +46,8 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path return --path->ref == 0; } -static inline void btree_path_set_dirty(struct btree_path *path, +static inline void btree_path_set_dirty(struct btree_trans *trans, + struct btree_path *path, enum btree_path_uptodate u) { path->uptodate = max_t(unsigned, path->uptodate, u); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9948d0e4d442..9da950e7eb7d 100644 --- 
a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -656,7 +656,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, path2->should_be_locked = false; __bch2_btree_path_unlock(trans, path2); path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); } bch2_trans_verify_locks(trans); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 826930b4b164..2cdc9a04f3e8 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -494,7 +494,7 @@ static int btree_path_get_locks(struct btree_trans *trans, } __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); /* * When we fail to get a lock, we have to ensure that any child nodes @@ -594,7 +594,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, l++) { if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } @@ -636,7 +636,7 @@ bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || !path->should_be_locked; - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); return ret; } @@ -739,7 +739,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } @@ -880,7 +880,7 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, /* Debug */ -void 
__bch2_btree_path_verify_locks(struct btree_path *path) +void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path) { /* * A path may be uptodate and yet have nothing locked if and only if @@ -929,5 +929,5 @@ void __bch2_trans_verify_locks(struct btree_trans *trans) unsigned i; trans_for_each_path(trans, path, i) - __bch2_btree_path_verify_locks(path); + __bch2_btree_path_verify_locks(trans, path); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 63d7e5fb77c8..9adca77e2580 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -160,7 +160,7 @@ static inline int btree_path_highest_level_locked(struct btree_path *path) static inline void __bch2_btree_path_unlock(struct btree_trans *trans, struct btree_path *path) { - btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); @@ -433,7 +433,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, struct btree_path *path) { __btree_path_set_level_up(trans, path, path->level++); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); } /* debug */ @@ -445,13 +445,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); -void __bch2_btree_path_verify_locks(struct btree_path *); +void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *); void __bch2_trans_verify_locks(struct btree_trans *); -static inline void bch2_btree_path_verify_locks(struct btree_path *path) +static inline void bch2_btree_path_verify_locks(struct btree_trans *trans, + struct btree_path *path) { if (static_branch_unlikely(&bch2_debug_check_btree_locking)) - __bch2_btree_path_verify_locks(path); + __bch2_btree_path_verify_locks(trans, 
path); } static inline void bch2_trans_verify_locks(struct btree_trans *trans) From 22e921a6f9b8ec4c9ccbef4accae1494c6695745 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 May 2025 15:52:15 -0400 Subject: [PATCH 210/218] bcachefs: Simplify bch2_path_put() Simplify the "do we need to keep this locked?" checks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51ff452562af..77b91dd62d95 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1399,35 +1399,44 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { - struct btree_path *path = trans->paths + path_idx, *dup; + struct btree_path *path = trans->paths + path_idx, *dup = NULL; if (!__btree_path_put(trans, path, intent)) return; + if (!path->preserve && !path->should_be_locked) + goto free; + dup = path->preserve ? have_path_at_pos(trans, path) : have_node_at_pos(trans, path); - - trace_btree_path_free(trans, path_idx, dup); - - if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + if (!dup) return; - if (path->should_be_locked && !trans->restarted) { - if (!dup) - return; - + /* + * If we need this path locked, the duplicate also has to be locked + * before we free this one: + */ + if (path->should_be_locked && + !dup->should_be_locked && + !trans->restarted) { if (!(trans->locked ? 
bch2_btree_path_relock_norestart(trans, dup) : bch2_btree_path_can_relock(trans, dup))) return; + + dup->should_be_locked = true; } - if (dup) { - dup->preserve |= path->preserve; - dup->should_be_locked |= path->should_be_locked; - } + BUG_ON(path->should_be_locked && + !trans->restarted && + trans->locked && + !btree_node_locked(dup, dup->level)); + path->should_be_locked = false; + dup->preserve |= path->preserve; +free: + trace_btree_path_free(trans, path_idx, dup); __bch2_path_free(trans, path_idx); } From b41ac97fe0a6876edbc6fc90dfd05513ba7332ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Apr 2024 23:53:57 -0400 Subject: [PATCH 211/218] bcachefs: Path must be locked if trans->locked && should_be_locked If path->should_be_locked is true, that means user code (of the btree API) has seen, in this transaction, something guarded by the node this path has locked, and we have to keep it locked until the end of the transaction. Assert that we're not violating this; should_be_locked should also be cleared only in _very_ special situations. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_locking.c | 17 +++++++++-------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 77b91dd62d95..97f3faac8067 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1979,6 +1979,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ /* got to end? 
*/ if (!btree_path_node(path, path->level + 1)) { + path->should_be_locked = false; btree_path_set_level_up(trans, path); return NULL; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 7cb2c38b70c0..2cabb5f0f484 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -50,6 +50,7 @@ static inline void btree_path_set_dirty(struct btree_trans *trans, struct btree_path *path, enum btree_path_uptodate u) { + BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); path->uptodate = max_t(unsigned, path->uptodate, u); } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 2cdc9a04f3e8..2f2aed0c9916 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -882,14 +882,15 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path) { - /* - * A path may be uptodate and yet have nothing locked if and only if - * there is no node at path->level, which generally means we were - * iterating over all nodes and got to the end of the btree - */ - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level) && - !path->nodes_locked); + if (!path->nodes_locked && btree_path_node(path, path->level)) { + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); + BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); + } if (!path->nodes_locked) return; From 016c4b48b86d18b14f8a45beabefc5ccf7caf594 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 13:13:44 -0400 Subject: [PATCH 212/218] bcachefs: Fix endianness in casefold check/repair Fixes: 010c89468134 ("bcachefs: Check for casefolded dirents in non casefolded dirs") Signed-off-by: Kent Overstreet --- 
fs/bcachefs/fsck.c | 6 +++--- fs/bcachefs/sb-errors_format.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index dbfa3e0b8abb..49f46df8340e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2204,11 +2204,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, buf.buf))) { struct qstr name = bch2_dirent_get_name(d); u32 subvol = d.v->d_type == DT_SUBVOL - ? d.v->d_parent_subvol + ? le32_to_cpu(d.v->d_parent_subvol) : 0; u64 target = d.v->d_type == DT_SUBVOL - ? d.v->d_child_subvol - : d.v->d_inum; + ? le32_to_cpu(d.v->d_child_subvol) + : le64_to_cpu(d.v->d_inum); u64 dir_offset; ret = bch2_hash_delete_at(trans, diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 4036a20c6adc..0bfb151da9cf 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -209,7 +209,7 @@ enum bch_fsck_flags { x(subvol_to_missing_root, 188, 0) \ x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ x(bkey_in_missing_snapshot, 190, 0) \ - x(bkey_in_deleted_snapshot, 315, 0) \ + x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ x(inode_alloc_cursor_inode_bad, 301, 0) \ From f351d91edd507391518a4f5870185fa5bf38446b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 18:30:10 -0400 Subject: [PATCH 213/218] bcachefs: Fix allocate -> self healing path When we go to allocate and find that a bucket in the freespace btree is actually allocated, we're supposed to return nonzero to tell the allocator to skip it. This fixes an emergency read only due to a bucket/ptr gen mismatch - we also don't return the correct bucket gen when this happens. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a38b9c6c891e..173e81c2bbcb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1475,6 +1475,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite w->c = c; w->pos = BBPOS(iter->btree_id, iter->pos); queue_work(c->write_ref_wq, &w->work); + + ret = 1; /* don't allocate from this bucket */ goto out; } } From cade003209cfe728de2ef880d5704cc322a7ce1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 18:31:53 -0400 Subject: [PATCH 214/218] bcachefs: Fix opts.recovery_pass_last This was lost in the giant recovery pass rework - but it's used heavily by bcachefs subcommand utilities. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index f74f14227137..dabb29b08ad0 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -525,6 +525,9 @@ int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) c->opts.recovery_passes | c->sb.recovery_passes_required; + if (c->opts.recovery_pass_last) + passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; + /* * We can't allow set_may_go_rw to be excluded; that would cause us to * use the journal replay keys for updates where it's not expected. From 9b133c0d74b17db2dc0d2d70b6591b0ebb604463 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 14:19:25 -0400 Subject: [PATCH 215/218] bcachefs: Small check_fix_ptr fixes We don't want to change the bucket gen, on gen mismatch: it's possible to have multiple btree nodes with different gens in the same bucket that we want to keep, if we have to recover from btree node scan. 
It's also not necessary to set g->gen_valid; add a comment to that effect. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8bb6384190c5..09eb5a543ae4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -156,10 +156,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; } else { + /* this pointer will be dropped */ *do_update = true; + goto out; } } + /* g->gen_valid == true */ + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" @@ -172,15 +176,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, if (!p.ptr.cached && (g->data_type != BCH_DATA_btree || data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - } else { - *do_update = true; } + + *do_update = true; } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, @@ -217,9 +219,8 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; + if (!p.ptr.cached && + data_type == BCH_DATA_btree) { g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; From 521f9584c2bd48198ac9d9b99a372b1306f3bb97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 14:03:06 -0400 Subject: [PATCH 216/218] bcachefs: Ensure we don't use a blacklisted journal seq Different versions differ on the size of the blacklist range; it is theoretically possible that we could end up with blacklisted journal sequence numbers newer than the newest seq we find in the journal, and 
pick a new start seq that's blacklisted. Explicitly check for this in bch2_fs_journal_start(). Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 17 ++++++++++++++++- fs/bcachefs/journal_seq_blacklist.c | 10 ++++++++++ fs/bcachefs/journal_seq_blacklist.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f2963a6cca88..09b70fd140a1 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -415,7 +415,7 @@ static int journal_entry_open(struct journal *j) if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) return -BCH_ERR_journal_max_open; - if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); @@ -459,6 +459,14 @@ static int journal_entry_open(struct journal *j) atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); + if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { + bch_err(c, "attempting to open blacklisted journal seq %llu", + journal_cur_seq(j)); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); + return -BCH_ERR_journal_shutdown; + } + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); @@ -1415,6 +1423,13 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = cur_seq, nr, seq; + /* + * + * XXX pick most recent non blacklisted sequence number + */ + + cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); + if (cur_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index e463d2d95359..c5a7d800a0f5 100644 --- 
a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -130,6 +130,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, return true; } +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t || !t->nr) + return 0; + + return t->entries[eytzinger0_last(t->nr)].end - 1; +} + int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index d47636f96fdc..f06942ccfcdd 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -12,6 +12,7 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) } bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); From 3f2f028814abf68ce4d74bfd2627cb84d2afa389 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 20:11:43 -0400 Subject: [PATCH 217/218] bcachefs: Fix btree_iter_next_node() for new locking asserts We can't unlock a should_be_locked path unless we're in a transaction restart. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 97f3faac8067..b4bf4217a3fa 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1991,12 +1991,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ bch2_btree_path_downgrade(trans, path); if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } From 9caea9208fc3fbdbd4a41a2de8c6a0c969b030f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 May 2025 21:59:12 -0400 Subject: [PATCH 218/218] bcachefs: Don't mount bs > ps without TRANSPARENT_HUGEPAGE Large folios aren't supported without TRANSPARENT_HUGEPAGE Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 24658bf450ab..11579b74c640 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -950,6 +950,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_opts_apply(&c->opts, *opts); + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + c->opts.block_size > PAGE_SIZE) { + bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); + ret = -EINVAL; + goto err; + } + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) 
c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;