From 0117591e69d1edff46bc87061e533a1e25a8c500 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 30 Nov 2023 23:32:20 -0500
Subject: [PATCH 01/12] bcachefs: Don't drop journal pins in exit path

There's no need to drop journal pins in our exit paths - the code was
trying to have everything cleaned up on any shutdown, but better to just
tweak the assertions a bit.

This fixes a bug where calling into journal reclaim in the exit path
would cause a null ptr deref.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_cache.c     | 8 +++-----
 fs/bcachefs/btree_io.c        | 4 ++--
 fs/bcachefs/btree_io.h        | 3 ---
 fs/bcachefs/btree_key_cache.c | 2 --
 4 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 47e7770d0583..79495cd7a794 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -9,6 +9,7 @@
 #include "debug.h"
 #include "errcode.h"
 #include "error.h"
+#include "journal.h"
 #include "trace.h"
 
 #include <linux/prefetch.h>
@@ -424,14 +425,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 		BUG_ON(btree_node_read_in_flight(b) ||
 		       btree_node_write_in_flight(b));
 
-		if (btree_node_dirty(b))
-			bch2_btree_complete_write(c, b, btree_current_write(b));
-		clear_btree_node_dirty_acct(c, b);
-
 		btree_node_data_free(c, b);
 	}
 
-	BUG_ON(atomic_read(&c->btree_cache.dirty));
+	BUG_ON(!bch2_journal_error(&c->journal) &&
+	       atomic_read(&c->btree_cache.dirty));
 
 	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 57c20390e10e..5a720f0cd5a6 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1704,8 +1704,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
 }
 
-void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
-			      struct btree_write *w)
+static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+				      struct btree_write *w)
 {
 	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
 
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 7e03dd76fb38..e0d7fa5b1dfb 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -134,9 +134,6 @@ void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
 int bch2_btree_root_read(struct bch_fs *, enum btree_id,
 			 const struct bkey_i *, unsigned);
 
-void bch2_btree_complete_write(struct bch_fs *, struct btree *,
-			      struct btree_write *);
-
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
 enum btree_write_flags {
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 37fbf22de8fc..1b7a5668df7c 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -992,8 +992,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 	list_for_each_entry_safe(ck, n, &items, list) {
 		cond_resched();
 
-		bch2_journal_pin_drop(&c->journal, &ck->journal);
-
 		list_del(&ck->list);
 		kfree(ck->k);
 		six_lock_exit(&ck->c.lock);

From ef6fae4a13aecfa7966edff0445e5c920ad2ddd9 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 1 Dec 2023 22:31:23 -0500
Subject: [PATCH 02/12] bcachefs: Don't use btree write buffer until journal
 replay is finished

The keys being replayed by journal replay have to be synchronized with
updates by other threads that overwrite them. We rely on btree node
locks for synchronizing - but since btree write buffer updates take no
btree locks, that won't work.

Instead, simply disable using the btree write buffer until journal
replay is finished.

This fixes a rare backpointers error in the merge_torture_flakey test.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_update.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 324767c0ddcc..25fdca00bf7b 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -554,6 +554,19 @@ int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
 						 BTREE_UPDATE_PREJOURNAL);
 }
 
+static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+						  enum btree_id btree,
+						  struct bkey_i *k)
+{
+	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
+	int ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		return ret;
+
+	bkey_copy(n, k);
+	return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
 int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
 					    enum btree_id btree,
 					    struct bkey_i *k)
@@ -564,6 +577,9 @@ int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
 	EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
 	EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
 
+	if (unlikely(trans->journal_replay_not_finished))
+		return bch2_btree_insert_clone_trans(trans, btree, k);
+
 	trans_for_each_wb_update(trans, i) {
 		if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
 			bkey_copy(&i->k, k);

From 87b0d8d3d05028c59b64c0287efeca90c28e1152 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 1 Dec 2023 01:14:50 -0500
Subject: [PATCH 03/12] bcachefs: Fix a journal deadlock in replay

Recently, journal pre-reservations were removed. They were for reserving
space ahead of time in the journal for operations that are required for
journal reclaim, e.g. btree key cache flushing and interior node btree
updates.

Instead we have watermarks - only operations for journal reclaim are
allowed when the journal is low on space, and in general we're quite
good about doing operations in the order that will free up space in the
journal quickest when we're low on space. If we're doing a journal
reclaim operation out of order, we usually do it in nonblocking mode if
it's not freeing up space at the end of the journal.

There's an exception though - interior btree node update operations have
to be BCH_WATERMARK_reclaim - once they've been started, and they can't
be nonblocking. Generally this is fine because they'll only be a very
small fraction of transaction commits - but there's an exception, which
is during journal replay.

Journal replay does many btree operations, but doesn't need to commit
them to the journal since they're already in the journal. So killing off
pre-reservations, plus another change to make journal replay more
efficient by initially doing the replay in sorted btree order, made it
possible for the interior update operations replay generates to fill and
deadlock the journal.

Fix this by introducing a new check on journal space at the _start_ of
an interior update operation. This causes us to block if necessary in
exactly the same way as we used to when interior updates took a journal
pre-reservation, but without all the expensive accounting
pre-reservations required.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_update_interior.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6697417273aa..26be38ab6ecb 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1056,6 +1056,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	flags &= ~BCH_WATERMARK_MASK;
 	flags |= watermark;
 
+	if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+	    watermark < c->journal.watermark) {
+		struct journal_res res = { 0 };
+
+		ret = drop_locks_do(trans,
+			bch2_journal_res_get(&c->journal, &res, 1,
+					     watermark|JOURNAL_RES_GET_CHECK));
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
 	while (1) {
 		nr_nodes[!!update_level] += 1 + split;
 		update_level++;

From 131898b0cb4ac6598d3537eeeee2711dec129f51 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 2 Dec 2023 02:43:58 -0500
Subject: [PATCH 04/12] bcachefs: Fix bch2_extent_drop_ptrs() call

Also, make bch2_extent_drop_ptrs() safer, so it works with extents and
non-extents iterators.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/data_update.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 71aa5e59787b..2418c528c533 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -471,7 +471,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
 	 * we aren't using the extent overwrite path to delete, we're
 	 * just using the normal key deletion path:
 	 */
-	if (bkey_deleted(&n->k))
+	if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
 		n->k.size = 0;
 
 	return bch2_trans_relock(trans) ?:
@@ -591,7 +591,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 		m->data_opts.rewrite_ptrs = 0;
 		/* if iter == NULL, it's just a promote */
 		if (iter)
-			ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+			ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
 		goto done;
 	}
 

From adcf4ee64291b701d083bacf653eb10a4c46acd7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 4 Dec 2023 00:38:56 -0500
Subject: [PATCH 05/12] bcachefs: Convert compression_stats to
 for_each_btree_key2

for_each_btree_key2() runs each loop iteration in a btree transaction,
and thus does not cause SRCU lock hold time problems.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/sysfs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index ab743115f169..f3cb7115b530 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -276,8 +276,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 		if (!btree_type_has_ptrs(id))
 			continue;
 
-		for_each_btree_key(trans, iter, id, POS_MIN,
-				   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+		ret = for_each_btree_key2(trans, iter, id, POS_MIN,
+					  BTREE_ITER_ALL_SNAPSHOTS, k, ({
 			struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 			const union bch_extent_entry *entry;
 			struct extent_ptr_decoded p;
@@ -309,8 +309,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 				nr_uncompressed_extents++;
 			else if (compressed)
 				nr_compressed_extents++;
-		}
-		bch2_trans_iter_exit(trans, &iter);
+			0;
+		}));
 	}
 
 	bch2_trans_put(trans);

From f88d811a238b12a261a04f125db952cf05c06d0b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 3 Dec 2023 13:05:21 -0500
Subject: [PATCH 06/12] bcachefs: Don't run indirect extent trigger unless
 inserting/deleting

This fixes a transaction path overflow reported in the snapshot deletion
path, when moving extents to the correct snapshot.

The root of the issue is that creating/deleting a reflink pointer can
generate an unbounded number of updates, if it is allowed to reference
an unbounded number of indirect extents; to prevent this, merging of
reflink pointers has been disabled.

But there's a hole, which is that copygc/rebalance may fragment existing
extents in the course of moving them around, and if an indirect extent
becomes too fragmented we'll then become unable to delete the reflink
pointer.

The eventual solution is going to be to tweak trigger handling so that
we can process large reflink pointers incrementally when necessary, and
notice that trigger updates don't need to be run for the part of the
reflink pointer not changing. That is going to be a bigger project
though, for another patch.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/reflink.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 6e1bfe9feb59..37d16e04e671 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -121,6 +121,14 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans,
 {
 	check_indirect_extent_deleting(new, &flags);
 
+	if (old.k->type == KEY_TYPE_reflink_v &&
+	    new->k.type == KEY_TYPE_reflink_v &&
+	    old.k->u64s == new->k.u64s &&
+	    !memcmp(bkey_s_c_to_reflink_v(old).v->start,
+		    bkey_i_to_reflink_v(new)->v.start,
+		    bkey_val_bytes(&new->k) - 8))
+		return 0;
+
 	return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
 }
 

From 7aebaabfede75feda5c5d16991da74124aee428d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 4 Dec 2023 15:44:15 -0500
Subject: [PATCH 07/12] bcachefs: Fix creating snapshot with implicit source

When creating a snapshot without specifying the source subvolume, we use
the subvolume containing the new snapshot.

Previously, this worked if the directory containing the new snapshot was
the subvolume root - but we were using the incorrect helper, and got a
subvolume ID of 0 when the parent directory wasn't the root of the
subvolume, causing an emergency read-only.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs-ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 5a39bcb597a3..a70b7a03057d 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -413,7 +413,7 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
 
 	if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
 	    !arg.src_ptr)
-		snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+		snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
 
 	inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
 			      dst_dentry, arg.mode|S_IFDIR,

From 5796230582f6131fa217f0a1700783c459c847d2 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 5 Dec 2023 08:24:38 -0500
Subject: [PATCH 08/12] bcachefs: don't attempt rw on unfreeze when shutdown

The internal freeze mechanism in bcachefs mostly reuses the generic
rw<->ro transition code. If the fs happens to shut down during or
after freeze, a transition back to rw can fail. This is expected,
but returning an error from the unfreeze callout prevents the
filesystem from being unfrozen.

Skip the read write transition if the fs is shutdown. This allows
the fs to unfreeze at the vfs level so writes will no longer block,
but will still fail due to the emergency read-only state of the fs.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 4d51be813509..371565e02ff2 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1733,6 +1733,9 @@ static int bch2_unfreeze(struct super_block *sb)
 	struct bch_fs *c = sb->s_fs_info;
 	int ret;
 
+	if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+		return 0;
+
 	down_write(&c->state_lock);
 	ret = bch2_fs_read_write(c);
 	up_write(&c->state_lock);

From e59728883943c6820a0aa413db66a38f2e8c27bd Mon Sep 17 00:00:00 2001
From: Daniel Hill <daniel@gluo.nz>
Date: Wed, 6 Dec 2023 21:26:00 +1300
Subject: [PATCH 09/12] bcachefs: rebalance shouldn't attempt to compress
 unwritten extents

This fixes a bug where rebalance would loop repeatedly on the same
extents.

Signed-off-by: Daniel Hill <daniel@gluo.nz>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/extents.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index f6c92df55270..9d8afcb5979a 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1294,7 +1294,8 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
 		unsigned i = 0;
 
 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
+			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+			    p.ptr.unwritten) {
 				rewrite_ptrs = 0;
 				goto incompressible;
 			}

From 6d1980f0af439b5fd49b1bee2220deff6888792e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 7 Dec 2023 12:39:13 -0500
Subject: [PATCH 10/12] bcachefs: Fix deleted inode check for dirs

We could delete directories transactionally on rmdir()/unlink(), but we
don't; instead, like with regular files we wait for the VFS to call
evict().

That means that our check for directories in the deleted inodes btree is
wrong - the check should be for non-empty directories.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/dirent.c | 19 +++++++++++--------
 fs/bcachefs/dirent.h |  1 +
 fs/bcachefs/inode.c  | 15 ++++++++++-----
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 1a0f2d571569..2bfff0da7000 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -485,20 +485,15 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
 	return ret;
 }
 
-int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	u32 snapshot;
 	int ret;
 
-	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
-	if (ret)
-		return ret;
-
 	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
-			   SPOS(dir.inum, 0, snapshot),
-			   POS(dir.inum, U64_MAX), 0, k, ret)
+			   SPOS(dir, 0, snapshot),
+			   POS(dir, U64_MAX), 0, k, ret)
 		if (k.k->type == KEY_TYPE_dirent) {
 			ret = -ENOTEMPTY;
 			break;
@@ -508,6 +503,14 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
 	return ret;
 }
 
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
+{
+	u32 snapshot;
+
+	return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
+		bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+}
+
 int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 {
 	struct btree_trans *trans = bch2_trans_get(c);
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index cd262bf4d9c5..1e3431990abd 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -64,6 +64,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
 		       const struct bch_hash_info *,
 		       const struct qstr *, subvol_inum *);
 
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
 int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
 int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
 
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index c7849b0753e7..9309cfeecd8d 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -7,6 +7,7 @@
 #include "btree_update.h"
 #include "buckets.h"
 #include "compress.h"
+#include "dirent.h"
 #include "error.h"
 #include "extents.h"
 #include "extent_update.h"
@@ -1093,11 +1094,15 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
-	if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
-			deleted_inode_is_dir,
-			"directory %llu:%u in deleted_inodes btree",
-			pos.offset, pos.snapshot))
-		goto delete;
+	if (S_ISDIR(inode.bi_mode)) {
+		ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
+		if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+				"non empty directory %llu:%u in deleted_inodes btree",
+				pos.offset, pos.snapshot))
+			goto delete;
+		if (ret)
+			goto out;
+	}
 
 	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
 			deleted_inode_not_unlinked,

From 4a147af2081070218a4c66523c584e198994528e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 10 Dec 2023 12:21:42 -0500
Subject: [PATCH 11/12] bcachefs: Fix uninitialized var in
 bch2_journal_replay()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/recovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 770ced1c6285..c7d9074c82d9 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -144,7 +144,7 @@ static int bch2_journal_replay(struct bch_fs *c)
 	u64 start_seq	= c->journal_replay_seq_start;
 	u64 end_seq	= c->journal_replay_seq_start;
 	size_t i;
-	int ret;
+	int ret = 0;
 
 	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
 	keys->gap = keys->nr;

From a66ff26b0f31189e413a87065c25949c359e4bef Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 10 Dec 2023 15:23:27 -0500
Subject: [PATCH 12/12] bcachefs: Close journal entry if necessary when
 flushing all pins

Since outstanding journal buffers hold a journal pin, when flushing all
pins we need to close the current journal entry if necessary so its pin
can be released.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal.c         | 8 ++++----
 fs/bcachefs/journal.h         | 1 +
 fs/bcachefs/journal_io.c      | 1 +
 fs/bcachefs/journal_reclaim.c | 3 +++
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 489b34046e78..8cf238be6213 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -249,7 +249,7 @@ static bool journal_entry_want_write(struct journal *j)
 	return ret;
 }
 
-static bool journal_entry_close(struct journal *j)
+bool bch2_journal_entry_close(struct journal *j)
 {
 	bool ret;
 
@@ -383,7 +383,7 @@ static bool journal_quiesced(struct journal *j)
 	bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
 
 	if (!ret)
-		journal_entry_close(j);
+		bch2_journal_entry_close(j);
 	return ret;
 }
 
@@ -436,7 +436,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
 
 	/*
 	 * Recheck after taking the lock, so we don't race with another thread
-	 * that just did journal_entry_open() and call journal_entry_close()
+	 * that just did journal_entry_open() and call bch2_journal_entry_close()
 	 * unnecessarily
 	 */
 	if (journal_res_get_fast(j, res, flags)) {
@@ -1041,7 +1041,7 @@ void bch2_fs_journal_stop(struct journal *j)
 	bch2_journal_reclaim_stop(j);
 	bch2_journal_flush_all_pins(j);
 
-	wait_event(j->wait, journal_entry_close(j));
+	wait_event(j->wait, bch2_journal_entry_close(j));
 
 	/*
 	 * Always write a new journal entry, to make sure the clock hands are up
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 4c513fca5ef2..2f768e11aec9 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -266,6 +266,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
 	return s;
 }
 
+bool bch2_journal_entry_close(struct journal *);
 void bch2_journal_buf_put_final(struct journal *, u64, bool);
 
 static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 0f17fc5f8d68..5de1b68fb8af 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1599,6 +1599,7 @@ static CLOSURE_CALLBACK(journal_write_done)
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
 
+	bch2_journal_reclaim_fast(j);
 	bch2_journal_space_available(j);
 
 	closure_wake_up(&w->wait);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index e63c6eda86af..ec712104addb 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -776,6 +776,9 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 			       (1U << JOURNAL_PIN_btree), 0, 0, 0))
 		*did_work = true;
 
+	if (seq_to_flush > journal_cur_seq(j))
+		bch2_journal_entry_close(j);
+
 	spin_lock(&j->lock);
 	/*
 	 * If journal replay hasn't completed, the unreplayed journal entries