From 09fe05c57b5aaf23e2c35036c98ea9f282b19a77 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 5 Jun 2023 16:33:35 +0200 Subject: [PATCH 1/3] rbd: move RBD_OBJ_FLAG_COPYUP_ENABLED flag setting Move RBD_OBJ_FLAG_COPYUP_ENABLED flag setting into the object request state machine to allow for the snapshot context to be captured in the image request state machine rather than in rbd_queue_workfn(). Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 84ad3b17956f..6c847db6ee2c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1334,14 +1334,28 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) /* * Must be called after rbd_obj_calc_img_extents(). */ -static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) +static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req) { - if (!obj_req->num_img_extents || - (rbd_obj_is_entire(obj_req) && - !obj_req->img_request->snapc->num_snaps)) - return false; + if (obj_req->img_request->op_type == OBJ_OP_DISCARD) { + dout("%s %p objno %llu discard\n", __func__, obj_req, + obj_req->ex.oe_objno); + return; + } - return true; + if (!obj_req->num_img_extents) { + dout("%s %p objno %llu not overlapping\n", __func__, obj_req, + obj_req->ex.oe_objno); + return; + } + + if (rbd_obj_is_entire(obj_req) && + !obj_req->img_request->snapc->num_snaps) { + dout("%s %p objno %llu entire\n", __func__, obj_req, + obj_req->ex.oe_objno); + return; + } + + obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; } static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) @@ -2233,9 +2247,6 @@ static int rbd_obj_init_write(struct rbd_obj_request *obj_req) if (ret) return ret; - if (rbd_obj_copyup_enabled(obj_req)) - obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; - obj_req->write_state = RBD_OBJ_WRITE_START; return 0; } @@ -2341,8 +2352,6 @@ static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req) if (ret) return ret; - if (rbd_obj_copyup_enabled(obj_req)) - obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; if (!obj_req->num_img_extents) { obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; if (rbd_obj_is_entire(obj_req)) @@ -3286,6 +3295,7 @@ static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) case RBD_OBJ_WRITE_START: rbd_assert(!*result); + rbd_obj_set_copyup_enabled(obj_req); if (rbd_obj_write_is_noop(obj_req)) return true; From 870611e4877eff1e8413c3fb92a585e45d5291f6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 5 Jun 2023 16:33:35 +0200 Subject: [PATCH 2/3] rbd: get snapshot context after exclusive lock is ensured to be held Move capturing the snapshot context into the image request state machine, after exclusive lock is ensured to be held for the duration of dealing with the image request. This is needed to ensure correctness of fast-diff states (OBJECT_EXISTS vs OBJECT_EXISTS_CLEAN) and object deltas computed based off of them. Otherwise the object map that is forked for the snapshot isn't guaranteed to accurately reflect the contents of the snapshot when the snapshot is taken under I/O. This breaks differential backup and snapshot-based mirroring use cases with fast-diff enabled: since some object deltas may be incomplete, the destination image may get corrupted. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/61472 Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6c847db6ee2c..632751ddb287 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1336,6 +1336,8 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) */ static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req) { + rbd_assert(obj_req->img_request->snapc); + if (obj_req->img_request->op_type == OBJ_OP_DISCARD) { dout("%s %p objno %llu discard\n", __func__, obj_req, obj_req->ex.oe_objno); @@ -1456,6 +1458,7 @@ __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, static struct ceph_osd_request * rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops) { + rbd_assert(obj_req->img_request->snapc); return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc, num_ops); } @@ -1592,15 +1595,18 @@ static void rbd_img_request_init(struct rbd_img_request *img_request, mutex_init(&img_request->state_mutex); } +/* + * Only snap_id is captured here, for reads. For writes, snapshot + * context is captured in rbd_img_object_requests() after exclusive + * lock is ensured to be held. + */ static void rbd_img_capture_header(struct rbd_img_request *img_req) { struct rbd_device *rbd_dev = img_req->rbd_dev; lockdep_assert_held(&rbd_dev->header_rwsem); - if (rbd_img_is_write(img_req)) - img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc); - else + if (!rbd_img_is_write(img_req)) img_req->snap_id = rbd_dev->spec->snap_id; if (rbd_dev_parent_get(rbd_dev)) @@ -3482,9 +3488,19 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) static void rbd_img_object_requests(struct rbd_img_request *img_req) { + struct rbd_device *rbd_dev = img_req->rbd_dev; struct rbd_obj_request *obj_req; rbd_assert(!img_req->pending.result && !img_req->pending.num_pending); + rbd_assert(!need_exclusive_lock(img_req) || + __rbd_is_lock_owner(rbd_dev)); + + if (rbd_img_is_write(img_req)) { + rbd_assert(!img_req->snapc); + down_read(&rbd_dev->header_rwsem); + img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc); + up_read(&rbd_dev->header_rwsem); + } for_each_obj_request(img_req, obj_req) { int result = 0; @@ -3502,7 +3518,6 @@ static void rbd_img_object_requests(struct rbd_img_request *img_req) static bool rbd_img_advance(struct rbd_img_request *img_req, int *result) { - struct rbd_device *rbd_dev = img_req->rbd_dev; int ret; again: @@ -3523,9 +3538,6 @@ static bool rbd_img_advance(struct rbd_img_request *img_req, int *result) if (*result) return true; - rbd_assert(!need_exclusive_lock(img_req) || - __rbd_is_lock_owner(rbd_dev)); - rbd_img_object_requests(img_req); if (!img_req->pending.num_pending) { *result = img_req->pending.result; @@ -3987,6 +3999,10 @@ static int rbd_post_acquire_action(struct rbd_device *rbd_dev) { int ret; + ret = rbd_dev_refresh(rbd_dev); + if (ret) + return ret; + if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) { ret = rbd_object_map_open(rbd_dev); if (ret) From 409e873ea3c1fd3079909718bbeb06ac1ec7f38b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 1 Jun 2023 08:59:31 +0800 Subject: [PATCH 3/3] ceph: fix use-after-free bug for inodes when flushing capsnaps There is a race between capsnaps flush and removing the inode from 'mdsc->snap_flush_list' list: == Thread A == == Thread B == ceph_queue_cap_snap() -> allocate 'capsnapA' ->ihold('&ci->vfs_inode') ->add 'capsnapA' to 'ci->i_cap_snaps' ->add 'ci' to 'mdsc->snap_flush_list' ... == Thread C == ceph_flush_snaps() ->__ceph_flush_snaps() ->__send_flush_snap() handle_cap_flushsnap_ack() ->iput('&ci->vfs_inode') this also will release 'ci' ... == Thread D == ceph_handle_snap() ->flush_snaps() ->iterate 'mdsc->snap_flush_list' ->get the stale 'ci' ->remove 'ci' from ->ihold(&ci->vfs_inode) this 'mdsc->snap_flush_list' will WARNING To fix this we will increase the inode's i_count ref when adding 'ci' to the 'mdsc->snap_flush_list' list. [ idryomov: need_put int -> bool ] Cc: stable@vger.kernel.org Link: https://bugzilla.redhat.com/show_bug.cgi?id=2209299 Signed-off-by: Xiubo Li Reviewed-by: Milind Changire Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 ++++++ fs/ceph/snap.c | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 789be30d6ee2..2321e5ddb664 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1627,6 +1627,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci, struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; + bool need_put = false; int mds; dout("ceph_flush_snaps %p\n", inode); @@ -1671,8 +1672,13 @@ void ceph_flush_snaps(struct ceph_inode_info *ci, ceph_put_mds_session(session); /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); + if (!list_empty(&ci->i_snap_flush_item)) + need_put = true; list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); + + if (need_put) + iput(inode); } /* diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0b236ebd989f..2e73ba62bd7a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -693,8 +693,10 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size); spin_lock(&mdsc->snap_flush_lock); - if (list_empty(&ci->i_snap_flush_item)) + if (list_empty(&ci->i_snap_flush_item)) { + ihold(inode); list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + } spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ }