From 49ece554288caf1a8ea9e546ab1ff5bc4b175456 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 08:37:00 -0500 Subject: [PATCH 01/33] rbd: fix leak of format 2 snapshot context When rbd_dev_v2_refresh() is called, the rbd device already has a snapshot context associated with it. But that never gets freed, the pointer just gets overwritten. Fix this by dropping the rbd device's reference to the snapshot context before overwriting the pointer. Because ceph_put_snap_context() already handles for a null pointer we don't need to check for that (for the probe case, where no context has yet been assigned). This resolves: http://tracker.ceph.com/issues/4912 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c2ca1818f335..426374321d75 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4004,6 +4004,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) for (i = 0; i < snap_count; i++) snapc->snaps[i] = ceph_decode_64(&p); + ceph_put_snap_context(rbd_dev->header.snapc); rbd_dev->header.snapc = snapc; dout(" snap context seq = %llu, snap_count = %u\n", From e627db085e0dab7744b68f3c927be6ed6df2f7f9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: [PATCH 02/33] rbd: revalidate only for mapping size changes This commit: d98df63e rbd: revalidate_disk upon rbd resize instituted a call to revalidate_disk() to notify interested parties that a mapped image has changed size. This works well, as long as the the rbd device doesn't map a snapshot. A snapshot will never change size. However, the base image the snapshot is associated with can, and it can do so while the snapshot is mapped. The problem is that the test for the size is looking at the size of the base image, not the size of the mapped snapshot. This patch corrects that. Update the warning message shown in the event of error, and move it into the callers. This resolves: http://tracker.ceph.com/issues/4911 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 426374321d75..5d5e3f0b5fb4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2628,6 +2628,7 @@ out: static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *rbd_dev = (struct rbd_device *)data; + int ret; if (!rbd_dev) return; @@ -2635,7 +2636,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, rbd_dev->header_name, (unsigned long long)notify_id, (unsigned int)opcode); - (void)rbd_dev_refresh(rbd_dev); + ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret); rbd_obj_notify_ack(rbd_dev, notify_id); } @@ -3182,11 +3185,11 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev) static int rbd_dev_refresh(struct rbd_device *rbd_dev) { - u64 image_size; + u64 mapping_size; int ret; rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - image_size = rbd_dev->header.image_size; + mapping_size = rbd_dev->mapping.size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) ret = rbd_dev_v1_refresh(rbd_dev); @@ -3197,10 +3200,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) rbd_exists_validate(rbd_dev); mutex_unlock(&ctl_mutex); - if (ret) - rbd_warn(rbd_dev, "got notification but failed to " - " update snaps: %d\n", ret); - if (image_size != rbd_dev->header.image_size) + if (mapping_size != rbd_dev->mapping.size) revalidate_disk(rbd_dev->disk); return ret; @@ -3405,6 +3405,8 @@ static ssize_t rbd_image_refresh(struct device *dev, int ret; ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); return ret < 0 ? ret : size; } From 00a653e216a8427547774ab3f2cc92709c3e28c9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: [PATCH 03/33] rbd: update capacity in rbd_dev_refresh() When a mapped image changes size, we change the capacity recorded for the Linux disk associated with it, in rbd_update_mapping_size(). That function is called in two places--the format 1 and format 2 refresh routines. There is no need to set the capacity while holding the header semaphore. Instead, do it in the common rbd_dev_refresh(), using the logic that's already there to initiate disk revalidation. Add handling in the request function, just in case a request that exceeds the capacity of the device comes in (perhaps one that was started before a refresh shrunk the device). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5d5e3f0b5fb4..9c094c67d778 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2874,6 +2874,13 @@ static void rbd_request_fn(struct request_queue *q) goto end_request; /* Shouldn't happen */ } + result = -EIO; + if (offset + length > rbd_dev->mapping.size) { + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", + offset, length, rbd_dev->mapping.size); + goto end_request; + } + result = -ENOMEM; img_request = rbd_img_request_create(rbd_dev, offset, length, write_request, false); @@ -3116,14 +3123,8 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev) if (rbd_dev->spec->snap_id != CEPH_NOSNAP) return; - if (rbd_dev->mapping.size != rbd_dev->header.image_size) { - sector_t size; - + if (rbd_dev->mapping.size != rbd_dev->header.image_size) rbd_dev->mapping.size = rbd_dev->header.image_size; - size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; - dout("setting size to %llu sectors", (unsigned long long)size); - set_capacity(rbd_dev->disk, size); - } } /* @@ -3200,8 +3201,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) rbd_exists_validate(rbd_dev); mutex_unlock(&ctl_mutex); - if (mapping_size != rbd_dev->mapping.size) + if (mapping_size != rbd_dev->mapping.size) { + sector_t size; + + size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long)size); + set_capacity(rbd_dev->disk, size); revalidate_disk(rbd_dev->disk); + } return ret; } From 29334ba49c3e3defd9a2697cd4a199c597c30dc9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: [PATCH 04/33] rbd: kill rbd_update_mapping_size() Since rbd_update_mapping_size() is now a trivial wrapper, just open code it in its two callers. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9c094c67d778..4c9869545073 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3118,15 +3118,6 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return ret; } -static void rbd_update_mapping_size(struct rbd_device *rbd_dev) -{ - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) - return; - - if (rbd_dev->mapping.size != rbd_dev->header.image_size) - rbd_dev->mapping.size = rbd_dev->header.image_size; -} - /* * only read the first part of the ondisk header, without the snaps info */ @@ -3143,7 +3134,9 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) /* Update image size, and check for resize of mapped image */ rbd_dev->header.image_size = h.image_size; - rbd_update_mapping_size(rbd_dev); + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) + if (rbd_dev->mapping.size != rbd_dev->header.image_size) + rbd_dev->mapping.size = rbd_dev->header.image_size; /* rbd_dev->header.object_prefix shouldn't change */ kfree(rbd_dev->header.snap_sizes); @@ -4074,7 +4067,9 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; - rbd_update_mapping_size(rbd_dev); + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) + if (rbd_dev->mapping.size != rbd_dev->header.image_size) + rbd_dev->mapping.size = rbd_dev->header.image_size; ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); From c734b79655a91a24afcae73738a43a0db09a801a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: [PATCH 05/33] rbd: don't print warning if not mapping a parent The presence of the LAYERING bit in an rbd image's feature mask does not guarantee the image actually has a parent image. Currently that bit is set only when a clone (i.e., image with a parent) is created, but it is (currently) not cleared if that clone gets flattened back into a "normal" image. A "parent_id" query will leave the parent_spec for the image being mapped a null pointer, but will not return an error. Currently, whenever an image with the LAYERED feature gets mapped, a warning about the use of layered images gets printed. But we don't want to do this for a flattened image, so print the warning only if we find there is a parent spec after the probe. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4c9869545073..2d34aea772be 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4567,13 +4567,14 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) goto out_err; - /* - * Don't print a warning for parent images. We can - * tell this point because we won't know its pool - * name yet (just its pool id). + * Print a warning if this image has a parent. + * Don't print it if the image now being probed + * is itself a parent. We can tell at this point + * because we won't know its pool name yet (just its + * pool id). */ - if (rbd_dev->spec->pool_name) + if (rbd_dev->parent_spec && rbd_dev->spec->pool_name) rbd_warn(rbd_dev, "WARNING: kernel layering " "is EXPERIMENTAL!"); } From 8f4b7d9821715767ac28bbc2d401bbb5f3f9a448 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: [PATCH 06/33] rbd: don't look up snapshot id in rbd_dev_mapping_set() Currently rbd_dev_mapping_set() looks up the snapshot id for the snapshot whose name is found in the rbd device's spec structure. That function gets called by rbd_dev_device_setup(), which is called by rbd_add() *after* rbd_dev_image_probe(). If the image probe succeeds, the rbd device's spec will already have been updated to include names and ids for all fields. Therefore there's no need to look up the snapshot id in rbd_dev_mapping_set(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2d34aea772be..75a8cea1a2d0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -936,20 +936,11 @@ static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) { - const char *snap_name = rbd_dev->spec->snap_name; - u64 snap_id; + u64 snap_id = rbd_dev->spec->snap_id; u64 size = 0; u64 features = 0; int ret; - if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) { - snap_id = rbd_snap_id_by_name(rbd_dev, snap_name); - if (snap_id == CEPH_NOSNAP) - return -ENOENT; - } else { - snap_id = CEPH_NOSNAP; - } - ret = rbd_snap_size(rbd_dev, snap_id, &size); if (ret) return ret; From 6d80b130d516deef51666e210fde674c947b8b5c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: [PATCH 07/33] rbd: kill rbd_dev_clear_mapping() This function is a duplicate of rbd_dev_mapping_clear(), and was added by mistake. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 75a8cea1a2d0..a62e59a0aea7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -966,13 +966,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.read_only = true; } -static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) -{ - rbd_dev->mapping.size = 0; - rbd_dev->mapping.features = 0; - rbd_dev->mapping.read_only = true; -} - static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) { char *name; @@ -4910,7 +4903,7 @@ static void rbd_dev_device_release(struct device *dev) rbd_free_disk(rbd_dev); clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - rbd_dev_clear_mapping(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); rbd_dev->major = 0; rbd_dev_id_put(rbd_dev); From 51344a38ba2033be18a4ec23e318845caeccdc04 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 07:40:30 -0500 Subject: [PATCH 08/33] rbd: always set read-only flag in rbd_add() Hold off setting the read-only flag in rbd_add() for an image being mapped until we have successfully probed the image. At that point we know whether it's a snapshot mapping or not, so we can set the read-only flag in that one place rather than doing so (for snapshots) in rbd_dev_mapping_set(). To do this, pass a flag to the image probe routine indicating whether we want a read-only mapping. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a62e59a0aea7..b5cac7931ffc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -359,7 +359,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -951,11 +951,6 @@ static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) rbd_dev->mapping.size = size; rbd_dev->mapping.features = features; - /* If we are mapping a snapshot it must be marked read-only */ - - if (snap_id != CEPH_NOSNAP) - rbd_dev->mapping.read_only = true; - return 0; } @@ -963,7 +958,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) { rbd_dev->mapping.size = 0; rbd_dev->mapping.features = 0; - rbd_dev->mapping.read_only = true; } static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) @@ -4620,7 +4614,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (!parent) goto out_err; - ret = rbd_dev_image_probe(parent); + ret = rbd_dev_image_probe(parent, true); if (ret < 0) goto out_err; rbd_dev->parent = parent; @@ -4743,7 +4737,7 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) * device. For format 2 images this includes determining the image * id. */ -static int rbd_dev_image_probe(struct rbd_device *rbd_dev) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) { int ret; int tmp; @@ -4778,6 +4772,12 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto err_out_probe; + /* If we are mapping a snapshot it must be marked read-only */ + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + read_only = true; + rbd_dev->mapping.read_only = read_only; + ret = rbd_dev_probe_parent(rbd_dev); if (!ret) return 0; @@ -4811,6 +4811,7 @@ static ssize_t rbd_add(struct bus_type *bus, struct rbd_spec *spec = NULL; struct rbd_client *rbdc; struct ceph_osd_client *osdc; + bool read_only; int rc = -ENOMEM; if (!try_module_get(THIS_MODULE)) @@ -4820,6 +4821,9 @@ static ssize_t rbd_add(struct bus_type *bus, rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); if (rc < 0) goto err_out_module; + read_only = rbd_opts->read_only; + kfree(rbd_opts); + rbd_opts = NULL; /* done with this */ rbdc = rbd_get_client(ceph_opts); if (IS_ERR(rbdc)) { @@ -4850,11 +4854,7 @@ static ssize_t rbd_add(struct bus_type *bus, rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ - rbd_dev->mapping.read_only = rbd_opts->read_only; - kfree(rbd_opts); - rbd_opts = NULL; /* done with this */ - - rc = rbd_dev_image_probe(rbd_dev); + rc = rbd_dev_image_probe(rbd_dev, read_only); if (rc < 0) goto err_out_rbd_dev; From f35a4dee14c31dc00807f3bcd59cc7a6959f63d7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: [PATCH 09/33] rbd: set the mapping size and features later Defer setting the size and features fields of a mapped image until after the Linux disk structure is set up. Set the capacity of the disk after that. Rearrange the definition of rbd_image_header, separating the fields that are set only once from those that can be updated. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b5cac7931ffc..a05b6e5dc362 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -100,21 +100,20 @@ * block device image metadata (in-memory version) */ struct rbd_image_header { - /* These four fields never change for a given rbd image */ + /* These six fields never change for a given rbd image */ char *object_prefix; - u64 features; __u8 obj_order; __u8 crypt_type; __u8 comp_type; + u64 stripe_unit; + u64 stripe_count; + u64 features; /* Might be changeable someday? */ /* The remaining fields need to be updated occasionally */ u64 image_size; struct ceph_snap_context *snapc; - char *snap_names; - u64 *snap_sizes; - - u64 stripe_unit; - u64 stripe_count; + char *snap_names; /* format 1 only */ + u64 *snap_sizes; /* format 1 only */ }; /* @@ -4637,10 +4636,6 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) { int ret; - ret = rbd_dev_mapping_set(rbd_dev); - if (ret) - return ret; - /* generate unique id: find highest unique id, add one */ rbd_dev_id_get(rbd_dev); @@ -4662,13 +4657,17 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) if (ret) goto err_out_blkdev; - ret = rbd_bus_add_dev(rbd_dev); + ret = rbd_dev_mapping_set(rbd_dev); if (ret) goto err_out_disk; + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + + ret = rbd_bus_add_dev(rbd_dev); + if (ret) + goto err_out_mapping; /* Everything's ready. Announce the disk to the world. */ - set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); add_disk(rbd_dev->disk); @@ -4677,6 +4676,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) return ret; +err_out_mapping: + rbd_dev_mapping_clear(rbd_dev); err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: From 46578dcdca951f3da70d3a5a9b5166b2a492a182 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: [PATCH 10/33] rbd: zero format 1 header structure earlier The passed-in header structure is zeroed in rbd_header_from_disk(). Instead, have the caller do it. Note that there are two callers, rbd_dev_v1_refresh() and rbd_dev_v1_probe(). The latter already has a zeroed header structure so zeroing it isn't necessary there. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a05b6e5dc362..b0fbc38cc6f0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -738,8 +738,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, size_t size; u32 i; - memset(header, 0, sizeof (*header)); - snap_count = le32_to_cpu(ondisk->snap_count); len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); @@ -3103,6 +3101,7 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) int ret; struct rbd_image_header h; + memset(&h, 0, sizeof (h)); ret = rbd_read_header(rbd_dev, &h); if (ret < 0) return ret; From bb23e37acb2ae9604130c4819fb8ae0f784a3a2b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: [PATCH 11/33] rbd: refactor rbd_header_from_disk() This rearranges rbd_header_from_disk so that it: - allocates the snapshot context right away - keeps results in local variables, not changing the passed-in header until it's known we'll succeed - does initialization of set-once fields in a header only if they have not already been set The last point is moot at the moment, because rbd_read_header() (the only caller) always supplies a zero-filled header buffer. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 129 ++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 53 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b0fbc38cc6f0..a3b6bf5e9ae8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -727,86 +727,109 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) } /* - * Create a new header structure, translate header format from the on-disk - * header. + * Fill an rbd image header with information from the given format 1 + * on-disk header. */ static int rbd_header_from_disk(struct rbd_image_header *header, struct rbd_image_header_ondisk *ondisk) { + bool first_time = header->object_prefix == NULL; + struct ceph_snap_context *snapc; + char *object_prefix = NULL; + char *snap_names = NULL; + u64 *snap_sizes = NULL; u32 snap_count; - size_t len; size_t size; + int ret = -ENOMEM; u32 i; + /* Allocate this now to avoid having to handle failure below */ + + if (first_time) { + size_t len; + + len = strnlen(ondisk->object_prefix, + sizeof (ondisk->object_prefix)); + object_prefix = kmalloc(len + 1, GFP_KERNEL); + if (!object_prefix) + return -ENOMEM; + memcpy(object_prefix, ondisk->object_prefix, len); + object_prefix[len] = '\0'; + } + + /* Allocate the snapshot context and fill it in */ + snap_count = le32_to_cpu(ondisk->snap_count); - - len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); - header->object_prefix = kmalloc(len + 1, GFP_KERNEL); - if (!header->object_prefix) - return -ENOMEM; - memcpy(header->object_prefix, ondisk->object_prefix, len); - header->object_prefix[len] = '\0'; - + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); + if (!snapc) + goto out_err; + snapc->seq = le64_to_cpu(ondisk->snap_seq); if (snap_count) { + struct rbd_image_snap_ondisk *snaps; u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); - /* Save a copy of the snapshot names */ + /* We'll keep a copy of the snapshot names... */ - if (snap_names_len > (u64) SIZE_MAX) - return -EIO; - header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); - if (!header->snap_names) + if (snap_names_len > (u64)SIZE_MAX) + goto out_2big; + snap_names = kmalloc(snap_names_len, GFP_KERNEL); + if (!snap_names) goto out_err; + + /* ...as well as the array of their sizes. */ + + size = snap_count * sizeof (*header->snap_sizes); + snap_sizes = kmalloc(size, GFP_KERNEL); + if (!snap_sizes) + goto out_err; + /* - * Note that rbd_dev_v1_header_read() guarantees - * the ondisk buffer we're working with has + * Copy the names, and fill in each snapshot's id + * and size. + * + * Note that rbd_dev_v1_header_read() guarantees the + * ondisk buffer we're working with has * snap_names_len bytes beyond the end of the * snapshot id array, this memcpy() is safe. */ - memcpy(header->snap_names, &ondisk->snaps[snap_count], - snap_names_len); - - /* Record each snapshot's size */ - - size = snap_count * sizeof (*header->snap_sizes); - header->snap_sizes = kmalloc(size, GFP_KERNEL); - if (!header->snap_sizes) - goto out_err; - for (i = 0; i < snap_count; i++) - header->snap_sizes[i] = - le64_to_cpu(ondisk->snaps[i].image_size); - } else { - header->snap_names = NULL; - header->snap_sizes = NULL; + memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); + snaps = ondisk->snaps; + for (i = 0; i < snap_count; i++) { + snapc->snaps[i] = le64_to_cpu(snaps[i].id); + snap_sizes[i] = le64_to_cpu(snaps[i].image_size); + } } - header->features = 0; /* No features support in v1 images */ - header->obj_order = ondisk->options.order; - header->crypt_type = ondisk->options.crypt_type; - header->comp_type = ondisk->options.comp_type; + /* We won't fail any more, fill in the header */ - /* Allocate and fill in the snapshot context */ + if (first_time) { + header->object_prefix = object_prefix; + header->obj_order = ondisk->options.order; + header->crypt_type = ondisk->options.crypt_type; + header->comp_type = ondisk->options.comp_type; + /* The rest aren't used for format 1 images */ + header->stripe_unit = 0; + header->stripe_count = 0; + header->features = 0; + } + + /* The remaining fields always get updated (when we refresh) */ header->image_size = le64_to_cpu(ondisk->image_size); - - header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); - if (!header->snapc) - goto out_err; - header->snapc->seq = le64_to_cpu(ondisk->snap_seq); - for (i = 0; i < snap_count; i++) - header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); + header->snapc = snapc; + header->snap_names = snap_names; + header->snap_sizes = snap_sizes; return 0; - +out_2big: + ret = -EIO; out_err: - kfree(header->snap_sizes); - header->snap_sizes = NULL; - kfree(header->snap_names); - header->snap_names = NULL; - kfree(header->object_prefix); - header->object_prefix = NULL; + kfree(snap_sizes); + kfree(snap_names); + ceph_put_snap_context(snapc); + kfree(object_prefix); - return -ENOMEM; + return ret; } static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) From 662518b128c27def65e9af4bea2b56a1e04b3251 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:29 -0500 Subject: [PATCH 12/33] rbd: update in-core header directly Now that rbd_header_from_disk() only fills in one-time fields once, we can extend it slightly so it releases the other fields before replacing their values. This way there's no need to pass a temporary buffer and then copy all the results in. Just use the rbd device header structure in rbd_header_from_disk() so its values get updated directly. Note that this means we need to take the header semaphore at the point we update things. So pass the rbd_dev rather than the address of its header as its first argument to rbd_header_from_disk(), and have it return an error code. As a result, rbd_dev_v1_header_read() does all the work, rbd_read_header() becomes unnecessary, and rbd_dev_v1_refresh() becomes a very simple wrapper. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 98 +++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 71 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a3b6bf5e9ae8..e4586f2e04c2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -730,9 +730,10 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) * Fill an rbd image header with information from the given format 1 * on-disk header. */ -static int rbd_header_from_disk(struct rbd_image_header *header, +static int rbd_header_from_disk(struct rbd_device *rbd_dev, struct rbd_image_header_ondisk *ondisk) { + struct rbd_image_header *header = &rbd_dev->header; bool first_time = header->object_prefix == NULL; struct ceph_snap_context *snapc; char *object_prefix = NULL; @@ -802,6 +803,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, /* We won't fail any more, fill in the header */ + down_write(&rbd_dev->header_rwsem); if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; @@ -811,6 +813,10 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->stripe_unit = 0; header->stripe_count = 0; header->features = 0; + } else { + ceph_put_snap_context(header->snapc); + kfree(header->snap_names); + kfree(header->snap_sizes); } /* The remaining fields always get updated (when we refresh) */ @@ -820,6 +826,14 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_names = snap_names; header->snap_sizes = snap_sizes; + /* Make sure mapping size is consistent with header info */ + + if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) + if (rbd_dev->mapping.size != header->image_size) + rbd_dev->mapping.size = header->image_size; + + up_write(&rbd_dev->header_rwsem); + return 0; out_2big: ret = -EIO; @@ -3032,17 +3046,11 @@ out: } /* - * Read the complete header for the given rbd device. - * - * Returns a pointer to a dynamically-allocated buffer containing - * the complete and validated header. Caller can pass the address - * of a variable that will be filled in with the version of the - * header object at the time it was read. - * - * Returns a pointer-coded errno if a failure occurs. + * Read the complete header for the given rbd device. On successful + * return, the rbd_dev->header field will contain up-to-date + * information about the image. */ -static struct rbd_image_header_ondisk * -rbd_dev_v1_header_read(struct rbd_device *rbd_dev) +static int rbd_dev_v1_header_read(struct rbd_device *rbd_dev) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -3067,22 +3075,22 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev) size += names_size; ondisk = kmalloc(size, GFP_KERNEL); if (!ondisk) - return ERR_PTR(-ENOMEM); + return -ENOMEM; ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 0, size, ondisk); if (ret < 0) - goto out_err; + goto out; if ((size_t)ret < size) { ret = -ENXIO; rbd_warn(rbd_dev, "short header read (want %zd got %d)", size, ret); - goto out_err; + goto out; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; rbd_warn(rbd_dev, "invalid header"); - goto out_err; + goto out; } names_size = le64_to_cpu(ondisk->snap_names_len); @@ -3090,27 +3098,8 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev) snap_count = le32_to_cpu(ondisk->snap_count); } while (snap_count != want_count); - return ondisk; - -out_err: - kfree(ondisk); - - return ERR_PTR(ret); -} - -/* - * reload the ondisk the header - */ -static int rbd_read_header(struct rbd_device *rbd_dev, - struct rbd_image_header *header) -{ - struct rbd_image_header_ondisk *ondisk; - int ret; - - ondisk = rbd_dev_v1_header_read(rbd_dev); - if (IS_ERR(ondisk)) - return PTR_ERR(ondisk); - ret = rbd_header_from_disk(header, ondisk); + ret = rbd_header_from_disk(rbd_dev, ondisk); +out: kfree(ondisk); return ret; @@ -3121,40 +3110,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev, */ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) { - int ret; - struct rbd_image_header h; - - memset(&h, 0, sizeof (h)); - ret = rbd_read_header(rbd_dev, &h); - if (ret < 0) - return ret; - - down_write(&rbd_dev->header_rwsem); - - /* Update image size, and check for resize of mapped image */ - rbd_dev->header.image_size = h.image_size; - if (rbd_dev->spec->snap_id == CEPH_NOSNAP) - if (rbd_dev->mapping.size != rbd_dev->header.image_size) - rbd_dev->mapping.size = rbd_dev->header.image_size; - - /* rbd_dev->header.object_prefix shouldn't change */ - kfree(rbd_dev->header.snap_sizes); - kfree(rbd_dev->header.snap_names); - /* osd requests may still refer to snapc */ - ceph_put_snap_context(rbd_dev->header.snapc); - - rbd_dev->header.image_size = h.image_size; - rbd_dev->header.snapc = h.snapc; - rbd_dev->header.snap_names = h.snap_names; - rbd_dev->header.snap_sizes = h.snap_sizes; - /* Free the extra copy of the object prefix */ - if (strcmp(rbd_dev->header.object_prefix, h.object_prefix)) - rbd_warn(rbd_dev, "object prefix changed (ignoring)"); - kfree(h.object_prefix); - - up_write(&rbd_dev->header_rwsem); - - return ret; + return rbd_dev_v1_header_read(rbd_dev); } /* @@ -4517,7 +4473,7 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) /* Populate rbd image metadata */ - ret = rbd_read_header(rbd_dev, &rbd_dev->header); + ret = rbd_dev_v1_header_read(rbd_dev); if (ret < 0) goto out_err; From 30d60ba2f258da24b91edb880338c7178f901de9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:30 -0500 Subject: [PATCH 13/33] rbd: simplify rbd_dev_v1_probe() An rbd_dev structure's fields are all zero-filled for an initial probe, so there's no need to explicitly zero the parent_spec and parent_overlap fields in rbd_dev_v1_probe(). Removing these assignments makes rbd_dev_v1_probe() *almost* trivial. Move the dout() message that announces discovery of an image into rbd_dev_image_probe(), generalize to support images in either format and only show it if an image is fully discovered. This highlights that are some unnecessary cleanups in the error path for rbd_dev_v1_probe(), so they can be removed. Now rbd_dev_v1_probe() *is* a trivial wrapper function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e4586f2e04c2..ae223819bbf0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4469,31 +4469,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) { - int ret; - - /* Populate rbd image metadata */ - - ret = rbd_dev_v1_header_read(rbd_dev); - if (ret < 0) - goto out_err; - - /* Version 1 images have no parent (no layering) */ - - rbd_dev->parent_spec = NULL; - rbd_dev->parent_overlap = 0; - - dout("discovered version 1 image, header name is %s\n", - rbd_dev->header_name); - - return 0; - -out_err: - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; - kfree(rbd_dev->spec->image_id); - rbd_dev->spec->image_id = NULL; - - return ret; + return rbd_dev_v1_header_read(rbd_dev); } static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) @@ -4553,9 +4529,6 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) if (ret) goto out_err; - dout("discovered version 2 image, header name is %s\n", - rbd_dev->header_name); - return 0; out_err: rbd_dev->parent_overlap = 0; @@ -4758,9 +4731,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) rbd_dev->mapping.read_only = read_only; ret = rbd_dev_probe_parent(rbd_dev); - if (!ret) - return 0; + if (ret) + goto err_out_probe; + dout("discovered format %u image, header name is %s\n", + rbd_dev->image_format, rbd_dev->header_name); + + return 0; err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: From 99a41ebcee1a1ea0463b1b29d2e888de21a60c66 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:30 -0500 Subject: [PATCH 14/33] rbd: get rid of trivial v1 header wrappers Get rid of the trivial wrapper functions rbd_dev_v1_refresh() and rbd_dev_v1_probe(), substituting rbd_dev_v1_header_read() calls in their place. Rename rbd_dev_v1_header_read() to be rbd_dev_v1_header_info(), to be more generic (it will better reflect what happens with format 2 images). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ae223819bbf0..6748fe2d67d6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -788,7 +788,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, * Copy the names, and fill in each snapshot's id * and size. * - * Note that rbd_dev_v1_header_read() guarantees the + * Note that rbd_dev_v1_header_info() guarantees the * ondisk buffer we're working with has * snap_names_len bytes beyond the end of the * snapshot id array, this memcpy() is safe. @@ -3050,7 +3050,7 @@ out: * return, the rbd_dev->header field will contain up-to-date * information about the image. */ -static int rbd_dev_v1_header_read(struct rbd_device *rbd_dev) +static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -3105,14 +3105,6 @@ out: return ret; } -/* - * only read the first part of the ondisk header, without the snaps info - */ -static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) -{ - return rbd_dev_v1_header_read(rbd_dev); -} - /* * Clear the rbd device's EXISTS flag if the snapshot it's mapped to * has disappeared from the (just updated) snapshot context. @@ -3141,7 +3133,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) mapping_size = rbd_dev->mapping.size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) - ret = rbd_dev_v1_refresh(rbd_dev); + ret = rbd_dev_v1_header_info(rbd_dev); else ret = rbd_dev_v2_refresh(rbd_dev); @@ -4467,11 +4459,6 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) memset(header, 0, sizeof (*header)); } -static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) -{ - return rbd_dev_v1_header_read(rbd_dev); -} - static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) { int ret; @@ -4714,7 +4701,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) goto out_header_name; if (rbd_dev->image_format == 1) - ret = rbd_dev_v1_probe(rbd_dev); + ret = rbd_dev_v1_header_info(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); if (ret) From 2df3fac75851dc4257b90dc72fdd3cf27ba177bc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 09:51:30 -0500 Subject: [PATCH 15/33] rbd: define rbd_dev_v2_header_info() This rearranges rbd_dev_v2_refresh() so it works more like rbd_dev_v1_header_info(). While format 1 images need to read the whole header object to get any information, format 2 can collect almost all information selectively. So the one-time initialization will remain in a separate function--based on rbd_dev_v2_probe(). Rename rbd_dev_v2_refresh() to be rbd_dev_v2_header_info(), and have it call rbd_dev_v2_header_onetime() if it's being called for the first time for the given rbd device. Rename rbd_dev_v2_probe() to be rbd_dev_v2_header_onetime() and remove the image size and snapshot context calls it held in common with the refresh function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6748fe2d67d6..0d874a546949 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -425,7 +425,8 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request); static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, @@ -3135,7 +3136,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); else - ret = rbd_dev_v2_refresh(rbd_dev); + ret = rbd_dev_v2_header_info(rbd_dev); /* If it's a mapped snapshot, validate its EXISTS flag */ @@ -4005,12 +4006,19 @@ out: return snap_name; } -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) { + bool first_time = rbd_dev->header.object_prefix == NULL; int ret; down_write(&rbd_dev->header_rwsem); + if (first_time) { + ret = rbd_dev_v2_header_onetime(rbd_dev); + if (ret) + goto out; + } + ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; @@ -4459,22 +4467,18 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) memset(header, 0, sizeof (*header)); } -static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) { int ret; - ret = rbd_dev_v2_image_size(rbd_dev); - if (ret) - goto out_err; - - /* Get the object prefix (a.k.a. block_name) for the image */ - ret = rbd_dev_v2_object_prefix(rbd_dev); if (ret) goto out_err; - /* Get the and check features for the image */ - + /* + * Get the and check features for the image. Currently the + * features are assumed to never change. + */ ret = rbd_dev_v2_features(rbd_dev); if (ret) goto out_err; @@ -4504,17 +4508,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; } - - /* crypto and compression type aren't (yet) supported for v2 images */ - - rbd_dev->header.crypt_type = 0; - rbd_dev->header.comp_type = 0; - - /* Get the snapshot context, plus the header version */ - - ret = rbd_dev_v2_snap_context(rbd_dev); - if (ret) - goto out_err; + /* No support for crypto and compression type format 2 images */ return 0; out_err: @@ -4703,7 +4697,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); else - ret = rbd_dev_v2_probe(rbd_dev); + ret = rbd_dev_v2_header_info(rbd_dev); if (ret) goto err_out_watch; From 91c6febb3817be576785ef06aeaaa8ed423e0a2a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:32 -0500 Subject: [PATCH 16/33] rbd: fix an incorrect assertion condition In rbd_img_obj_parent_read_full_callback() there is an assertion intended to verify the size of the image request for a full parent read was the size of the original request's target object. But assertion was looking at the parent image order rather than the original one, and these values can differ. Fix that. This resolves: http://tracker.ceph.com/issues/4938 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0d874a546949..15ac2a54d4f3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2186,13 +2186,13 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) result = img_request->result; obj_size = img_request->length; xferred = img_request->xferred; + rbd_img_request_put(img_request); - rbd_dev = img_request->rbd_dev; + rbd_assert(orig_request->img_request); + rbd_dev = orig_request->img_request->rbd_dev; rbd_assert(rbd_dev); rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); - rbd_img_request_put(img_request); - if (result) goto out_err; From 5b2ab72d367d2682c1a237448fbc1595881a88fa Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 17/33] rbd: support reading parent page data Currently, rbd_img_parent_read() assumes the incoming object request contains bio data. But if a layered image is part of a multi-layer stack of images it will result in read requests of page data to parent images. Fortunately, it's not hard to add support for page data. This resolves: http://tracker.ceph.com/issues/4939 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 15ac2a54d4f3..2a0e9b81be48 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2574,7 +2574,7 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) rbd_assert(obj_request_img_data_test(obj_request)); rbd_assert(obj_request->img_request != NULL); rbd_assert(obj_request->result == (s32) -ENOENT); - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_type_valid(obj_request->type)); rbd_dev = obj_request->img_request->rbd_dev; rbd_assert(rbd_dev->parent != NULL); @@ -2590,8 +2590,12 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) rbd_obj_request_get(obj_request); img_request->obj_request = obj_request; - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - obj_request->bio_list); + if (obj_request->type == OBJ_REQUEST_BIO) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + obj_request->bio_list); + else + result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, + obj_request->pages); if (result) goto out_err; From 7ce4eef7b5fad73b365b7e4b8892af3af72d4bd3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 18/33] rbd: set mapping read-only flag in rbd_add() The rbd_dev->mapping field for a parent image is not meaningful. Since rbd_image_probe() is used both for images being mapped and their parents, it doesn't make sense to set that flag in that function. So move the setting of the mapping.read_only flag out of rbd_dev_image_probe() and into rbd_add() instead. This resolves: http://tracker.ceph.com/issues/4940 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2a0e9b81be48..dbfc44a9defd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -358,7 +358,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -4549,7 +4549,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (!parent) goto out_err; - ret = rbd_dev_image_probe(parent, true); + ret = rbd_dev_image_probe(parent); if (ret < 0) goto out_err; rbd_dev->parent = parent; @@ -4671,10 +4671,9 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) /* * Probe for the existence of the header object for the given rbd - * device. For format 2 images this includes determining the image - * id. + * device. */ -static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev) { int ret; int tmp; @@ -4709,12 +4708,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) if (ret) goto err_out_probe; - /* If we are mapping a snapshot it must be marked read-only */ - - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) - read_only = true; - rbd_dev->mapping.read_only = read_only; - ret = rbd_dev_probe_parent(rbd_dev); if (ret) goto err_out_probe; @@ -4795,10 +4788,16 @@ static ssize_t rbd_add(struct bus_type *bus, rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ - rc = rbd_dev_image_probe(rbd_dev, read_only); + rc = rbd_dev_image_probe(rbd_dev); if (rc < 0) goto err_out_rbd_dev; + /* If we are mapping a snapshot it must be marked read-only */ + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + read_only = true; + rbd_dev->mapping.read_only = read_only; + rc = rbd_dev_device_setup(rbd_dev); if (!rc) return count; From 1f3ef78861ac4b510175e177899b9b5ba4bbed91 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 19/33] rbd: only set up watch for mapped images Any changes to parent images are immaterial to any mapped clone. So there is no need to have a watch event registered on header objects except for the header object of an image that is mapped. In fact, a watch request is a write operation, and we may only have read access to a parent image. We can't set up the watch request until we know the name of the header object though. So pass a flag to rbd_dev_image_probe() to indicate whether this probe is for a mapping or for a parent image. Change the second parameter to rbd_dev_header_watch_sync() be Boolean while we're at it. This resolves: http://tracker.ceph.com/issues/4941 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index dbfc44a9defd..e417704de8ca 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -358,7 +358,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -2664,7 +2664,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) * Request sync osd watch/unwatch. The value of "start" determines * whether a watch request is being initiated or torn down. */ -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; @@ -2698,7 +2698,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) rbd_dev->watch_request->osd_req); osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, start); + rbd_dev->watch_event->cookie, 0, start ? 1 : 0); rbd_osd_req_format_write(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); @@ -4549,7 +4549,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (!parent) goto out_err; - ret = rbd_dev_image_probe(parent); + ret = rbd_dev_image_probe(parent, false); if (ret < 0) goto out_err; rbd_dev->parent = parent; @@ -4654,12 +4654,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) static void rbd_dev_image_release(struct rbd_device *rbd_dev) { - int ret; - rbd_dev_unprobe(rbd_dev); - ret = rbd_dev_header_watch_sync(rbd_dev, 0); - if (ret) - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; rbd_dev->image_format = 0; @@ -4671,9 +4666,11 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) /* * Probe for the existence of the header object for the given rbd - * device. + * device. If this image is the one being mapped (i.e., not a + * parent), initiate a watch on its header object before using that + * object to get detailed information about the rbd image. */ -static int rbd_dev_image_probe(struct rbd_device *rbd_dev) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) { int ret; int tmp; @@ -4693,9 +4690,11 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto err_out_format; - ret = rbd_dev_header_watch_sync(rbd_dev, 1); - if (ret) - goto out_header_name; + if (mapping) { + ret = rbd_dev_header_watch_sync(rbd_dev, true); + if (ret) + goto out_header_name; + } if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); @@ -4719,9 +4718,12 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: - tmp = rbd_dev_header_watch_sync(rbd_dev, 0); - if (tmp) - rbd_warn(rbd_dev, "unable to tear down watch request\n"); + if (mapping) { + tmp = rbd_dev_header_watch_sync(rbd_dev, false); + if (tmp) + rbd_warn(rbd_dev, "unable to tear down " + "watch request (%d)\n", tmp); + } out_header_name: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; @@ -4788,7 +4790,7 @@ static ssize_t rbd_add(struct bus_type *bus, rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ - rc = rbd_dev_image_probe(rbd_dev); + rc = rbd_dev_image_probe(rbd_dev, true); if (rc < 0) goto err_out_rbd_dev; @@ -4910,10 +4912,13 @@ static ssize_t rbd_remove(struct bus_type *bus, spin_unlock_irq(&rbd_dev->lock); if (ret < 0) goto done; - ret = count; rbd_bus_del_dev(rbd_dev); + ret = rbd_dev_header_watch_sync(rbd_dev, false); + if (ret) + rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); rbd_dev_image_release(rbd_dev); module_put(THIS_MODULE); + ret = count; done: mutex_unlock(&ctl_mutex); From c48f3f86e248b1649ad22151dd45ef2610101ed3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 08:49:52 -0500 Subject: [PATCH 20/33] rbd: kill rbd_img_request_get() Get rid of rbd_img_request_get(), because it isn't used, and maybe won't ever be needed. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e417704de8ca..51c45e793354 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1357,13 +1357,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } -static void rbd_img_request_get(struct rbd_img_request *img_request) -{ - dout("%s: img %p (was %d)\n", __func__, img_request, - atomic_read(&img_request->kref.refcount)); - kref_get(&img_request->kref); -} - static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { @@ -1888,9 +1881,6 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); - rbd_img_request_get(img_request); /* Avoid a warning */ - rbd_img_request_put(img_request); /* TEMPORARY */ - dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, write_request ? "write" : "read", offset, length, img_request); From c10ebbf55b40503a46fb8b29824c9ca1ca089826 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 May 2013 14:56:32 -0500 Subject: [PATCH 21/33] libceph: init sent and completed when starting The rbd code has a need to be able to restart an osd request that has already been started and completed once before. This currently wouldn't work right because the osd client code assumes an osd request will be started exactly once Certain fields in a request are never cleared and this leads to trouble if you try to reuse it. Specifically, the r_sent, r_got_reply, and r_completed fields are never cleared. The r_sent field records the osd incarnation at the time the request was sent to that osd. If that's non-zero, the message won't get re-mapped to a target osd properly, and won't be put on the unsafe requests list the first time it's sent as it should. The r_got_reply field is used in handle_reply() to ensure the reply to a request is processed only once. And the r_completed field is used for lingering requests to avoid calling the callback function every time the osd client re-sends the request on behalf of its initiator. Each osd request passes through ceph_osdc_start_request() when responsibility for the request is handed over to the osd client for completion. We can safely zero these three fields there each time a request gets started. One last related change--clear the r_linger flag when a request is no longer registered as a linger request. This resolves: http://tracker.ceph.com/issues/5026 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a3395fdfbd4f..d5953b87918c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1204,6 +1204,7 @@ void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, mutex_lock(&osdc->request_mutex); if (req->r_linger) { __unregister_linger_request(osdc, req); + req->r_linger = 0; ceph_osdc_put_request(req); } mutex_unlock(&osdc->request_mutex); @@ -2120,7 +2121,9 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); __register_request(osdc, req); - WARN_ON(req->r_sent); + req->r_sent = 0; + req->r_got_reply = 0; + req->r_completed = 0; rc = __map_request(osdc, req, 0); if (rc < 0) { if (nofail) { From ebda6408f2227a774343c3cc2861384942143ff3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 May 2013 16:29:22 -0500 Subject: [PATCH 22/33] rbd: fix parent request size assumption The code that reads object data from the parent for a copyup on write request currently assumes that the size of that request is the size of a "full" object from the original target image. That is not necessarily the case. The parent overlap could reduce the request size below that. To fix that assumption we need to record the number of pages in the copyup_pages array, for both an image request and an object request. Rename a local variable in rbd_img_obj_parent_read_full_callback() to reflect we're recording the length of the parent read request, not the size of the target object. This resolves: http://tracker.ceph.com/issues/5038 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 51c45e793354..597b9bbe2fc7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -224,6 +224,7 @@ struct rbd_obj_request { }; }; struct page **copyup_pages; + u32 copyup_page_count; struct ceph_osd_request *osd_req; @@ -256,6 +257,7 @@ struct rbd_img_request { struct rbd_obj_request *obj_request; /* obj req initiator */ }; struct page **copyup_pages; + u32 copyup_page_count; spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; @@ -2119,7 +2121,7 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; struct rbd_device *rbd_dev; - u64 length; + struct page **pages; u32 page_count; rbd_assert(obj_request->type == OBJ_REQUEST_BIO); @@ -2129,12 +2131,14 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) rbd_dev = img_request->rbd_dev; rbd_assert(rbd_dev); - length = (u64)1 << rbd_dev->header.obj_order; - page_count = (u32)calc_pages_for(0, length); - rbd_assert(obj_request->copyup_pages); - ceph_release_page_vector(obj_request->copyup_pages, page_count); + pages = obj_request->copyup_pages; + rbd_assert(pages != NULL); obj_request->copyup_pages = NULL; + page_count = obj_request->copyup_page_count; + rbd_assert(page_count); + obj_request->copyup_page_count = 0; + ceph_release_page_vector(pages, page_count); /* * We want the transfer count to reflect the size of the @@ -2158,9 +2162,9 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) struct ceph_osd_client *osdc; struct rbd_device *rbd_dev; struct page **pages; + u32 page_count; int result; - u64 obj_size; - u64 xferred; + u64 parent_length; rbd_assert(img_request_child_test(img_request)); @@ -2169,19 +2173,21 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) pages = img_request->copyup_pages; rbd_assert(pages != NULL); img_request->copyup_pages = NULL; + page_count = img_request->copyup_page_count; + rbd_assert(page_count); + img_request->copyup_page_count = 0; orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); rbd_assert(orig_request->type == OBJ_REQUEST_BIO); result = img_request->result; - obj_size = img_request->length; - xferred = img_request->xferred; + parent_length = img_request->length; + rbd_assert(parent_length == img_request->xferred); rbd_img_request_put(img_request); rbd_assert(orig_request->img_request); rbd_dev = orig_request->img_request->rbd_dev; rbd_assert(rbd_dev); - rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); if (result) goto out_err; @@ -2195,11 +2201,12 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) goto out_err; orig_request->osd_req = osd_req; orig_request->copyup_pages = pages; + orig_request->copyup_page_count = page_count; /* Initialize the copyup op */ osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); - osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, + osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, false, false); /* Then the original write request op */ @@ -2312,6 +2319,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) if (result) goto out_err; parent_request->copyup_pages = pages; + parent_request->copyup_page_count = page_count; parent_request->callback = rbd_img_obj_parent_read_full_callback; result = rbd_img_request_submit(parent_request); @@ -2319,6 +2327,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) return 0; parent_request->copyup_pages = NULL; + parent_request->copyup_page_count = 0; parent_request->obj_request = NULL; rbd_obj_request_put(obj_request); out_err: From b91f09f17b2a302f07022e2f766969e2536d71b3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 May 2013 16:29:22 -0500 Subject: [PATCH 23/33] rbd: support reading parent page data for writes Currently, rbd_img_obj_parent_read_full() assumes the incoming object request contains bio data. But if a layered image is part of a multi-layer stack of images it will result in read requests of page data to parent images. This is handling the same kind of issue as was resolved by this commit: 5b2ab72d rbd: support reading parent page data This resolves: http://tracker.ceph.com/issues/5027 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 597b9bbe2fc7..5161e80a38ef 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2165,6 +2165,8 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) u32 page_count; int result; u64 parent_length; + u64 offset; + u64 length; rbd_assert(img_request_child_test(img_request)); @@ -2179,7 +2181,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); - rbd_assert(orig_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_type_valid(orig_request->type)); result = img_request->result; parent_length = img_request->length; rbd_assert(parent_length == img_request->xferred); @@ -2211,11 +2213,17 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) /* Then the original write request op */ + offset = orig_request->offset; + length = orig_request->length; osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, - orig_request->offset, - orig_request->length, 0, 0); - osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, - orig_request->length); + offset, length, 0, 0); + if (orig_request->type == OBJ_REQUEST_BIO) + osd_req_op_extent_osd_data_bio(osd_req, 1, + orig_request->bio_list, length); + else + osd_req_op_extent_osd_data_pages(osd_req, 1, + orig_request->pages, length, + offset & ~PAGE_MASK, false, false); rbd_osd_req_format_write(orig_request); @@ -2261,7 +2269,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) int result; rbd_assert(obj_request_img_data_test(obj_request)); - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_type_valid(obj_request->type)); img_request = obj_request->img_request; rbd_assert(img_request != NULL); From 70cf49cfc7a4d1eb4aeea6cd128b88230be9d0b1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 24/33] rbd: ignore zero-overlap parent An rbd clone image that has an overlap with its parent of 0 is effectively not a layered image at all. Detect this case and treat such an image as non-layered. Issue a warning to be sure the user knows what's going on. This resolves: http://tracker.ceph.com/issues/5028 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5161e80a38ef..b67ecda1e7ef 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3665,9 +3665,13 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); - rbd_dev->parent_overlap = overlap; - rbd_dev->parent_spec = parent_spec; - parent_spec = NULL; /* rbd_dev now owns this */ + if (overlap) { + rbd_dev->parent_spec = parent_spec; + parent_spec = NULL; /* rbd_dev now owns this */ + rbd_dev->parent_overlap = overlap; + } else { + rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n"); + } out: ret = 0; out_err: From 642a25375f4c863607d2170f4471aec8becf7788 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 25/33] rbd: get parent info on refresh Get parent info for format 2 images on every refresh (rather than just during the initial probe). This will be needed to detect the disappearance of the parent image in the event a mapped image becomes unlayered (i.e., flattened). Avoid leaking the previous parent spec on the second and subsequent times this information is requested by dropping the previous one (if any) before updating it. (Also, extract the pool id into a local variable before assigning it into the parent spec.) Switch to using a non-zero parent overlap value rather than the existence of a parent (a non-null parent_spec pointer) to determine whether to mark a request layered. It will soon be possible for a layered image to become unlayered while a request is in flight. This means that the layered flag for an image request indicates that there was a non-zero parent overlap at the time the image request was created. The parent overlap can change thereafter, which may lead to special handling at request submission or completion time. This and the next several patches are related to: http://tracker.ceph.com/issues/3763 NOTE: If an error occurs while refreshing the parent info (i.e., requesting it after initial probe), the old parent info will persist. This is not really correct, and is a scenario that needs to be addressed. For now we'll assert that the failure mode is unlikely, but the issue has been documented in tracker issue 5040. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 67 +++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b67ecda1e7ef..fcef63c2c30b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1873,7 +1873,7 @@ static struct rbd_img_request *rbd_img_request_create( } if (child_request) img_request_child_set(img_request); - if (rbd_dev->parent_spec) + if (rbd_dev->parent_overlap) img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; @@ -3613,6 +3613,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) __le64 snapid; void *p; void *end; + u64 pool_id; char *image_id; u64 overlap; int ret; @@ -3643,18 +3644,19 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) p = reply_buf; end = reply_buf + ret; ret = -ERANGE; - ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); - if (parent_spec->pool_id == CEPH_NOPOOL) + ceph_decode_64_safe(&p, end, pool_id, out_err); + if (pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ /* The ceph file layout needs to fit pool id in 32 bits */ ret = -EIO; - if (parent_spec->pool_id > (u64)U32_MAX) { + if (pool_id > (u64)U32_MAX) { rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", - (unsigned long long)parent_spec->pool_id, U32_MAX); + (unsigned long long)pool_id, U32_MAX); goto out_err; } + parent_spec->pool_id = pool_id; image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { @@ -3666,6 +3668,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ceph_decode_64_safe(&p, end, overlap, out_err); if (overlap) { + rbd_spec_put(rbd_dev->parent_spec); rbd_dev->parent_spec = parent_spec; parent_spec = NULL; /* rbd_dev now owns this */ rbd_dev->parent_overlap = overlap; @@ -4034,17 +4037,43 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) goto out; } + /* + * If the image supports layering, get the parent info. We + * need to probe the first time regardless. Thereafter we + * only need to if there's a parent, to see if it has + * disappeared due to the mapped image getting flattened. + */ + if (rbd_dev->header.features & RBD_FEATURE_LAYERING && + (first_time || rbd_dev->parent_spec)) { + bool warn; + + ret = rbd_dev_v2_parent_info(rbd_dev); + if (ret) + goto out; + + /* + * Print a warning if this is the initial probe and + * the image has a parent. Don't print it if the + * image now being probed is itself a parent. We + * can tell at this point because we won't know its + * pool name yet (just its pool id). + */ + warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name; + if (first_time && warn) + rbd_warn(rbd_dev, "WARNING: kernel layering " + "is EXPERIMENTAL!"); + } + ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) if (rbd_dev->mapping.size != rbd_dev->header.image_size) rbd_dev->mapping.size = rbd_dev->header.image_size; ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); - if (ret) - goto out; out: up_write(&rbd_dev->header_rwsem); @@ -4498,24 +4527,6 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) if (ret) goto out_err; - /* If the image supports layering, get the parent info */ - - if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { - ret = rbd_dev_v2_parent_info(rbd_dev); - if (ret) - goto out_err; - /* - * Print a warning if this image has a parent. - * Don't print it if the image now being probed - * is itself a parent. We can tell at this point - * because we won't know its pool name yet (just its - * pool id). - */ - if (rbd_dev->parent_spec && rbd_dev->spec->pool_name) - rbd_warn(rbd_dev, "WARNING: kernel layering " - "is EXPERIMENTAL!"); - } - /* If the image supports fancy striping, get its parameters */ if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { @@ -4527,11 +4538,7 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) return 0; out_err: - rbd_dev->parent_overlap = 0; - rbd_spec_put(rbd_dev->parent_spec); - rbd_dev->parent_spec = NULL; - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; + rbd_dev->header.features = 0; kfree(rbd_dev->header.object_prefix); rbd_dev->header.object_prefix = NULL; From 8785b1d487f0a31afd2c802499786d3b355eccea Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 May 2013 10:08:49 -0500 Subject: [PATCH 26/33] rbd: don't release write request until necessary Previously when a layered write was going to involve a copyup request, the original osd request was released before submitting the parent full-object read. The osd request for the copyup would then be allocated in rbd_img_obj_parent_read_full_callback(). Shortly we will be handling the event of mapped layered images getting flattened, and when that occurs we need to resubmit the original request. We therefore don't want to release the osd request until we really konw we're going to replace it--in the callback function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fcef63c2c30b..d861c71b4005 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2194,13 +2194,17 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) if (result) goto out_err; - /* Allocate the new copyup osd request for the original request */ - + /* + * The original osd request is of no use to use any more. + * We need a new one that can hold the two ops in a copyup + * request. Allocate the new copyup osd request for the + * original request, and release the old one. + */ result = -ENOMEM; - rbd_assert(!orig_request->osd_req); osd_req = rbd_osd_req_create_copyup(orig_request); if (!osd_req) goto out_err; + rbd_osd_req_destroy(orig_request->osd_req); orig_request->osd_req = osd_req; orig_request->copyup_pages = pages; orig_request->copyup_page_count = page_count; @@ -2276,15 +2280,6 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) rbd_dev = img_request->rbd_dev; rbd_assert(rbd_dev->parent != NULL); - /* - * First things first. The original osd request is of no - * use to use any more, we'll need a new one that can hold - * the two ops in a copyup request. We'll get that later, - * but for now we can release the old one. - */ - rbd_osd_req_destroy(obj_request->osd_req); - obj_request->osd_req = NULL; - /* * Determine the byte range covered by the object in the * child image to which the original request was to be sent. From fb65d2284c117cfc28d30217d25a14a8e7a75a94 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 22:50:04 -0500 Subject: [PATCH 27/33] rbd: define rbd_dev_unparent() Define rbd_dev_unparent() to encapsulate cleaning up parent data structures from a layered rbd image. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d861c71b4005..9c2b20a88be2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1837,6 +1837,17 @@ static void rbd_obj_request_destroy(struct kref *kref) kmem_cache_free(rbd_obj_request_cache, obj_request); } +/* It's OK to call this for a device with no parent */ + +static void rbd_spec_put(struct rbd_spec *spec); +static void rbd_dev_unparent(struct rbd_device *rbd_dev) +{ + rbd_dev_remove_parent(rbd_dev); + rbd_spec_put(rbd_dev->parent_spec); + rbd_dev->parent_spec = NULL; + rbd_dev->parent_overlap = 0; +} + /* * Caller is responsible for filling in the list of object requests * that comprises the image request, and the Linux request pointer @@ -4491,10 +4502,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; - rbd_dev_remove_parent(rbd_dev); - rbd_spec_put(rbd_dev->parent_spec); - rbd_dev->parent_spec = NULL; - rbd_dev->parent_overlap = 0; + rbd_dev_unparent(rbd_dev); /* Free dynamic fields from the header, then zero it out */ @@ -4570,7 +4578,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) return 0; out_err: if (parent) { - rbd_spec_put(rbd_dev->parent_spec); + rbd_dev_unparent(rbd_dev); kfree(rbd_dev->header_name); rbd_dev_destroy(parent); } else { From e93f3152357ca75284284bef8eeea7d45fe1bab1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 22:50:04 -0500 Subject: [PATCH 28/33] rbd: define parent image request routines Define rbd_parent_request_create() and rbd_parent_request_destroy() to handle the creation of parent image requests submitted for layered image objects. For simplicity, let rbd_img_request_put() handle dropping the reference to any image request (parent or not), and call whichever destructor is appropriate on the last put. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 78 ++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9c2b20a88be2..1ffdfbfbf3c4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1359,13 +1359,18 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } +static bool img_request_child_test(struct rbd_img_request *img_request); +static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { rbd_assert(img_request != NULL); dout("%s: img %p (was %d)\n", __func__, img_request, atomic_read(&img_request->kref.refcount)); - kref_put(&img_request->kref, rbd_img_request_destroy); + if (img_request_child_test(img_request)) + kref_put(&img_request->kref, rbd_parent_request_destroy); + else + kref_put(&img_request->kref, rbd_img_request_destroy); } static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, @@ -1482,6 +1487,12 @@ static void img_request_child_set(struct rbd_img_request *img_request) smp_mb(); } +static void img_request_child_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_CHILD, &img_request->flags); + smp_mb(); +} + static bool img_request_child_test(struct rbd_img_request *img_request) { smp_mb(); @@ -1856,8 +1867,7 @@ static void rbd_dev_unparent(struct rbd_device *rbd_dev) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request, - bool child_request) + bool write_request) { struct rbd_img_request *img_request; @@ -1882,8 +1892,6 @@ static struct rbd_img_request *rbd_img_request_create( } else { img_request->snap_id = rbd_dev->spec->snap_id; } - if (child_request) - img_request_child_set(img_request); if (rbd_dev->parent_overlap) img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); @@ -1918,12 +1926,46 @@ static void rbd_img_request_destroy(struct kref *kref) if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); - if (img_request_child_test(img_request)) - rbd_obj_request_put(img_request->obj_request); - kmem_cache_free(rbd_img_request_cache, img_request); } +static struct rbd_img_request *rbd_parent_request_create( + struct rbd_obj_request *obj_request, + u64 img_offset, u64 length) +{ + struct rbd_img_request *parent_request; + struct rbd_device *rbd_dev; + + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + + parent_request = rbd_img_request_create(rbd_dev->parent, + img_offset, length, false); + if (!parent_request) + return NULL; + + img_request_child_set(parent_request); + rbd_obj_request_get(obj_request); + parent_request->obj_request = obj_request; + + return parent_request; +} + +static void rbd_parent_request_destroy(struct kref *kref) +{ + struct rbd_img_request *parent_request; + struct rbd_obj_request *orig_request; + + parent_request = container_of(kref, struct rbd_img_request, kref); + orig_request = parent_request->obj_request; + + parent_request->obj_request = NULL; + rbd_obj_request_put(orig_request); + img_request_child_clear(parent_request); + + rbd_img_request_destroy(kref); +} + static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; @@ -2321,13 +2363,10 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) } result = -ENOMEM; - parent_request = rbd_img_request_create(rbd_dev->parent, - img_offset, length, - false, true); + parent_request = rbd_parent_request_create(obj_request, + img_offset, length); if (!parent_request) goto out_err; - rbd_obj_request_get(obj_request); - parent_request->obj_request = obj_request; result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); if (result) @@ -2580,7 +2619,6 @@ out: static void rbd_img_parent_read(struct rbd_obj_request *obj_request) { - struct rbd_device *rbd_dev; struct rbd_img_request *img_request; int result; @@ -2589,20 +2627,14 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) rbd_assert(obj_request->result == (s32) -ENOENT); rbd_assert(obj_request_type_valid(obj_request->type)); - rbd_dev = obj_request->img_request->rbd_dev; - rbd_assert(rbd_dev->parent != NULL); /* rbd_read_finish(obj_request, obj_request->length); */ - img_request = rbd_img_request_create(rbd_dev->parent, + img_request = rbd_parent_request_create(obj_request, obj_request->img_offset, - obj_request->length, - false, true); + obj_request->length); result = -ENOMEM; if (!img_request) goto out_err; - rbd_obj_request_get(obj_request); - img_request->obj_request = obj_request; - if (obj_request->type == OBJ_REQUEST_BIO) result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, obj_request->bio_list); @@ -2913,7 +2945,7 @@ static void rbd_request_fn(struct request_queue *q) result = -ENOMEM; img_request = rbd_img_request_create(rbd_dev, offset, length, - write_request, false); + write_request); if (!img_request) goto end_request; From a2acd00e7964dbb1668a5956c9d0a4bdeb838c4a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 8 May 2013 22:50:04 -0500 Subject: [PATCH 29/33] rbd: reference count parent requests Keep a reference count for uses of the parent information for an rbd device. An initial reference is set in rbd_img_request_create() if the target image has a parent (with non-zero overlap). Each image request for an image with a non-zero parent overlap gets another reference when it's created, and that reference is dropped when the request is destroyed. The initial reference is dropped when the image gets torn down. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 104 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1ffdfbfbf3c4..8b6091b6d5cb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -55,6 +55,39 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) +/* + * Increment the given counter and return its updated value. + * If the counter is already 0 it will not be incremented. + * If the counter is already at its maximum value returns + * -EINVAL without updating it. + */ +static int atomic_inc_return_safe(atomic_t *v) +{ + unsigned int counter; + + counter = (unsigned int)__atomic_add_unless(v, 1, 0); + if (counter <= (unsigned int)INT_MAX) + return (int)counter; + + atomic_dec(v); + + return -EINVAL; +} + +/* Decrement the counter. Return the resulting value, or -EINVAL */ +static int atomic_dec_return_safe(atomic_t *v) +{ + int counter; + + counter = atomic_dec_return(v); + if (counter >= 0) + return counter; + + atomic_inc(v); + + return -EINVAL; +} + #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -312,6 +345,7 @@ struct rbd_device { struct rbd_spec *parent_spec; u64 parent_overlap; + atomic_t parent_ref; struct rbd_device *parent; /* protects updating the header */ @@ -361,6 +395,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); +static void rbd_spec_put(struct rbd_spec *spec); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -1505,6 +1540,12 @@ static void img_request_layered_set(struct rbd_img_request *img_request) smp_mb(); } +static void img_request_layered_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_LAYERED, &img_request->flags); + smp_mb(); +} + static bool img_request_layered_test(struct rbd_img_request *img_request) { smp_mb(); @@ -1859,6 +1900,58 @@ static void rbd_dev_unparent(struct rbd_device *rbd_dev) rbd_dev->parent_overlap = 0; } +/* + * Parent image reference counting is used to determine when an + * image's parent fields can be safely torn down--after there are no + * more in-flight requests to the parent image. When the last + * reference is dropped, cleaning them up is safe. + */ +static void rbd_dev_parent_put(struct rbd_device *rbd_dev) +{ + int counter; + + if (!rbd_dev->parent_spec) + return; + + counter = atomic_dec_return_safe(&rbd_dev->parent_ref); + if (counter > 0) + return; + + /* Last reference; clean up parent data structures */ + + if (!counter) + rbd_dev_unparent(rbd_dev); + else + rbd_warn(rbd_dev, "parent reference underflow\n"); +} + +/* + * If an image has a non-zero parent overlap, get a reference to its + * parent. + * + * Returns true if the rbd device has a parent with a non-zero + * overlap and a reference for it was successfully taken, or + * false otherwise. + */ +static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) +{ + int counter; + + if (!rbd_dev->parent_spec) + return false; + + counter = atomic_inc_return_safe(&rbd_dev->parent_ref); + if (counter > 0 && rbd_dev->parent_overlap) + return true; + + /* Image was flattened, but parent is not yet torn down */ + + if (counter < 0) + rbd_warn(rbd_dev, "parent reference overflow\n"); + + return false; +} + /* * Caller is responsible for filling in the list of object requests * that comprises the image request, and the Linux request pointer @@ -1892,7 +1985,7 @@ static struct rbd_img_request *rbd_img_request_create( } else { img_request->snap_id = rbd_dev->spec->snap_id; } - if (rbd_dev->parent_overlap) + if (rbd_dev_parent_get(rbd_dev)) img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; @@ -1923,6 +2016,11 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_img_obj_request_del(img_request, obj_request); rbd_assert(img_request->obj_request_count == 0); + if (img_request_layered_test(img_request)) { + img_request_layered_clear(img_request); + rbd_dev_parent_put(img_request->rbd_dev); + } + if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); @@ -3502,6 +3600,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, spin_lock_init(&rbd_dev->lock); rbd_dev->flags = 0; + atomic_set(&rbd_dev->parent_ref, 0); INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); @@ -4534,7 +4633,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; - rbd_dev_unparent(rbd_dev); + rbd_dev_parent_put(rbd_dev); /* Free dynamic fields from the header, then zero it out */ @@ -4606,6 +4705,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; rbd_dev->parent = parent; + atomic_set(&rbd_dev->parent_ref, 1); return 0; out_err: From 392a9dad7e777296fe79d97a6b3acd735ad2eb5f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 30/33] rbd: detect when clone image is flattened A format 2 clone image can be the subject of a "flatten" operation, during which all of its data gets "copied up" from its parent image, leaving the image fully populated. Once this is complete, the clone's association with the parent is abolished. Since this can occur when a clone is mapped, we need to detect when it has occurred and handle it accordingly. We know an image has been flattened when we know it at one time had a parent, but we have learned (via a "get_parent" object class method call) it no longer has one. There might be in-flight requests at the point we learn an image has been flattened, so we can't simply clean up parent data structures right away. Instead, we'll drop the initial parent reference when the parent has disappeared (rather than when the image gets destroyed), which will allow the last in-flight reference to clean things up when it's complete. We leverage the fact that a zero parent overlap renders an image effectively unlayered. We set the overlap to 0 at the point we detect the clone image has flattened, which allows the unlayered behavior to take effect immediately, while keeping other parent structures in place until in-flight requests to complete. This and the next few patches resolve: http://tracker.ceph.com/issues/3763 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8b6091b6d5cb..9717e20f3477 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1929,6 +1929,11 @@ static void rbd_dev_parent_put(struct rbd_device *rbd_dev) * If an image has a non-zero parent overlap, get a reference to its * parent. * + * We must get the reference before checking for the overlap to + * coordinate properly with zeroing the parent overlap in + * rbd_dev_v2_parent_info() when an image gets flattened. We + * drop it again if there is no overlap. + * * Returns true if the rbd device has a parent with a non-zero * overlap and a reference for it was successfully taken, or * false otherwise. @@ -3782,8 +3787,26 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) end = reply_buf + ret; ret = -ERANGE; ceph_decode_64_safe(&p, end, pool_id, out_err); - if (pool_id == CEPH_NOPOOL) + if (pool_id == CEPH_NOPOOL) { + /* + * Either the parent never existed, or we have + * record of it but the image got flattened so it no + * longer has a parent. When the parent of a + * layered image disappears we immediately set the + * overlap to 0. The effect of this is that all new + * requests will be treated as if the image had no + * parent. + */ + if (rbd_dev->parent_overlap) { + rbd_dev->parent_overlap = 0; + smp_mb(); + rbd_dev_parent_put(rbd_dev); + pr_info("%s: clone image has been flattened\n", + rbd_dev->disk->disk_name); + } + goto out; /* No parent? No problem. */ + } /* The ceph file layout needs to fit pool id in 32 bits */ @@ -4633,7 +4656,10 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; - rbd_dev_parent_put(rbd_dev); + /* Drop parent reference unless it's already been done (or none) */ + + if (rbd_dev->parent_overlap) + rbd_dev_parent_put(rbd_dev); /* Free dynamic fields from the header, then zero it out */ From 02c74fbad9d4a5149756eb35be7300736e4904e9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 31/33] rbd: re-submit read request for flattened clone If a clone image gets flattened while a parent read request is underway, the original rbd object request needs to be resubmitted. The reason is that by the time we get the response to the parent read request, the data read from the parent may be out of date. In other words, we could see this sequence of events: rbd client parent image/osd ---------- ---------------- original object ENOENT; issue parent read respond to parent read child image flattened original image header refresh <--- original object written independently here parent read response received Add code to rbd_img_parent_read_callback() to detect when a clone's parent image has disappeared (as evidenced by its parent overlap becoming 0), and re-submit the original read request in that case. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9717e20f3477..4edcb6d85f01 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2682,14 +2682,36 @@ static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) struct rbd_obj_request *obj_request; struct rbd_device *rbd_dev; u64 obj_end; + u64 img_xferred; + int img_result; rbd_assert(img_request_child_test(img_request)); + /* First get what we need from the image request and release it */ + obj_request = img_request->obj_request; + img_xferred = img_request->xferred; + img_result = img_request->result; + rbd_img_request_put(img_request); + + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to re-submit the + * original request. + */ rbd_assert(obj_request); rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; - obj_request->result = img_request->result; + osdc = &rbd_dev->rbd_client->client->osdc; + img_result = rbd_obj_request_submit(osdc, obj_request); + if (!img_result) + return; + } + + obj_request->result = img_result; if (obj_request->result) goto out; @@ -2702,7 +2724,6 @@ static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) */ rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); obj_end = obj_request->img_offset + obj_request->length; - rbd_dev = obj_request->img_request->rbd_dev; if (obj_end > rbd_dev->parent_overlap) { u64 xferred = 0; @@ -2710,12 +2731,11 @@ static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) xferred = rbd_dev->parent_overlap - obj_request->img_offset; - obj_request->xferred = min(img_request->xferred, xferred); + obj_request->xferred = min(img_xferred, xferred); } else { - obj_request->xferred = img_request->xferred; + obj_request->xferred = img_xferred; } out: - rbd_img_request_put(img_request); rbd_img_obj_request_read_callback(obj_request); rbd_obj_request_complete(obj_request); } From bbea1c1a31b1128d34b2b5b4f28276969cce14e9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 32/33] rbd: re-submit write request for flattened clone Add code to rbd_img_parent_read_full_callback() to detect when a clone's parent image has disappeared, and re-submit the original write request in that case. (See the previous commit for more reasoning about why this is appropriate.) Rename some variables in rbd_img_obj_parent_read_full_callback() to match the convention used in the previous patch. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4edcb6d85f01..5c2731859e8a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2319,7 +2319,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) struct rbd_device *rbd_dev; struct page **pages; u32 page_count; - int result; + int img_result; u64 parent_length; u64 offset; u64 length; @@ -2338,7 +2338,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); rbd_assert(obj_request_type_valid(orig_request->type)); - result = img_request->result; + img_result = img_request->result; parent_length = img_request->length; rbd_assert(parent_length == img_request->xferred); rbd_img_request_put(img_request); @@ -2347,7 +2347,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) rbd_dev = orig_request->img_request->rbd_dev; rbd_assert(rbd_dev); - if (result) + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to free the pages + * and re-submit the original write request. + */ + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + ceph_release_page_vector(pages, page_count); + osdc = &rbd_dev->rbd_client->client->osdc; + img_result = rbd_obj_request_submit(osdc, orig_request); + if (!img_result) + return; + } + + if (img_result) goto out_err; /* @@ -2356,7 +2371,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) * request. Allocate the new copyup osd request for the * original request, and release the old one. */ - result = -ENOMEM; + img_result = -ENOMEM; osd_req = rbd_osd_req_create_copyup(orig_request); if (!osd_req) goto out_err; @@ -2391,13 +2406,13 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) orig_request->callback = rbd_img_obj_copyup_callback; osdc = &rbd_dev->rbd_client->client->osdc; - result = rbd_obj_request_submit(osdc, orig_request); - if (!result) + img_result = rbd_obj_request_submit(osdc, orig_request); + if (!img_result) return; out_err: /* Record the error code and complete the request */ - orig_request->result = result; + orig_request->result = img_result; orig_request->xferred = 0; obj_request_done_set(orig_request); rbd_obj_request_complete(orig_request); From 638f5abed3f7d8a7fc24087bd760fa3d99f68a39 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 May 2013 17:40:33 -0500 Subject: [PATCH 33/33] rbd: re-submit flattened write request (part 2) Add code to rbd_img_obj_exists_callback() to detect when a clone's parent image has disappeared, and re-submit the original write request in that case. Kill off some redundant assertions. This completes the resolution for: http://tracker.ceph.com/issues/3763 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5c2731859e8a..6b872f219774 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2516,6 +2516,7 @@ out_err: static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) { struct rbd_obj_request *orig_request; + struct rbd_device *rbd_dev; int result; rbd_assert(!obj_request_img_data_test(obj_request)); @@ -2538,8 +2539,21 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) obj_request->xferred, obj_request->length); rbd_obj_request_put(obj_request); - rbd_assert(orig_request); - rbd_assert(orig_request->img_request); + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to free the pages + * and re-submit the original write request. + */ + rbd_dev = orig_request->img_request->rbd_dev; + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + rbd_obj_request_put(orig_request); + osdc = &rbd_dev->rbd_client->client->osdc; + result = rbd_obj_request_submit(osdc, orig_request); + if (!result) + return; + } /* * Our only purpose here is to determine whether the object