From 8bf8c376ab2eefaf0386f4e003e720e1434fa43d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Mar 2010 06:28:06 +0300 Subject: [PATCH 01/34] blkdev: fix merge_bvec_fn return value checks v2 merge_bvec_fn() returns bvec->bv_len on success. So we have to check against this value. But in case of fs_optimization merge we compare with wrong value. This patch must be included in b428cd6da7e6559aca69aa2e3a526037d3f20403 But accidentally i've forgot to add this in the initial patch. To make things straight let's replace all such checks. In fact this makes code easy to understand. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- fs/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index dc17afd672e3..d89e0199dc0d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -555,7 +555,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page .bi_rw = bio->bi_rw, }; - if (q->merge_bvec_fn(q, &bvm, prev) < len) { + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { prev->bv_len -= len; return 0; } @@ -608,7 +608,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; From c2282adbdea3548ae0271c1b1b2deec5f56ad224 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Mar 2010 09:11:07 +0100 Subject: [PATCH 02/34] Documentation: fix block/biodoc.txt dma mapping description - It looks incorrect to use blk_rq_map_sg with pci_map_page here about DMA mappings. dma_map_sg? - better to use dma_map_page instead of pci_map_page. http://marc.info/?l=linux-kernel&m=126596737604808&w=2 Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 6fab97ea7e6b..508b5b2b0289 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -1162,8 +1162,8 @@ where a driver received a request ala this before: As mentioned, there is no virtual mapping of a bio. For DMA, this is not a problem as the driver probably never will need a virtual mapping. -Instead it needs a bus mapping (pci_map_page for a single segment or -use blk_rq_map_sg for scatter gather) to be able to ship it to the driver. For +Instead it needs a bus mapping (dma_map_page for a single segment or +use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, From 881245dcff29df992d8431392a41fb81549129f9 Mon Sep 17 00:00:00 2001 From: William Cohen Date: Tue, 9 Mar 2010 09:26:04 +0100 Subject: [PATCH 03/34] Add DocBook documentation for the block tracepoints. This patch adds a simple description of the various block tracepoints available in the kernel. Signed-off-by: William Cohen Acked-by: Randy Dunlap Signed-off-by: Jens Axboe --- Documentation/DocBook/tracepoint.tmpl | 13 ++ include/trace/events/block.h | 164 ++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl index 8bca1d5cec09..e8473eae2a20 100644 --- a/Documentation/DocBook/tracepoint.tmpl +++ b/Documentation/DocBook/tracepoint.tmpl @@ -16,6 +16,15 @@ + + William + Cohen + +
+ wcohen@redhat.com +
+
+
@@ -91,4 +100,8 @@ !Iinclude/trace/events/signal.h + + Block IO +!Iinclude/trace/events/block.h + diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 5fb72733331e..d870a918559c 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -40,6 +40,16 @@ DECLARE_EVENT_CLASS(block_rq_with_error, __entry->nr_sector, __entry->errors) ); +/** + * block_rq_abort - abort block operation request + * @q: queue containing the block operation request + * @rq: block IO operation request + * + * Called immediately after pending block IO operation request @rq in + * queue @q is aborted. The fields in the operation request @rq + * can be examined to determine which device and sectors the pending + * operation would access. + */ DEFINE_EVENT(block_rq_with_error, block_rq_abort, TP_PROTO(struct request_queue *q, struct request *rq), @@ -47,6 +57,15 @@ DEFINE_EVENT(block_rq_with_error, block_rq_abort, TP_ARGS(q, rq) ); +/** + * block_rq_requeue - place block IO request back on a queue + * @q: queue holding operation + * @rq: block IO operation request + * + * The block operation request @rq is being placed back into queue + * @q. For some reason the request was not completed and needs to be + * put back in the queue. + */ DEFINE_EVENT(block_rq_with_error, block_rq_requeue, TP_PROTO(struct request_queue *q, struct request *rq), @@ -54,6 +73,17 @@ DEFINE_EVENT(block_rq_with_error, block_rq_requeue, TP_ARGS(q, rq) ); +/** + * block_rq_complete - block IO operation completed by device driver + * @q: queue containing the block operation request + * @rq: block operations request + * + * The block_rq_complete tracepoint event indicates that some portion + * of operation request has been completed by the device driver. If + * the @rq->bio is %NULL, then there is absolutely no additional work to + * do for the request. If @rq->bio is non-NULL then there is + * additional work required to complete the request. + */ DEFINE_EVENT(block_rq_with_error, block_rq_complete, TP_PROTO(struct request_queue *q, struct request *rq), @@ -95,6 +125,16 @@ DECLARE_EVENT_CLASS(block_rq, __entry->nr_sector, __entry->comm) ); +/** + * block_rq_insert - insert block operation request into queue + * @q: target queue + * @rq: block IO operation request + * + * Called immediately before block operation request @rq is inserted + * into queue @q. The fields in the operation request @rq struct can + * be examined to determine which device and sectors the pending + * operation would access. + */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request_queue *q, struct request *rq), @@ -102,6 +142,14 @@ DEFINE_EVENT(block_rq, block_rq_insert, TP_ARGS(q, rq) ); +/** + * block_rq_issue - issue pending block IO request operation to device driver + * @q: queue holding operation + * @rq: block IO operation operation request + * + * Called when block operation request @rq from queue @q is sent to a + * device driver for processing. + */ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request_queue *q, struct request *rq), @@ -109,6 +157,17 @@ DEFINE_EVENT(block_rq, block_rq_issue, TP_ARGS(q, rq) ); +/** + * block_bio_bounce - used bounce buffer when processing block operation + * @q: queue holding the block operation + * @bio: block operation + * + * A bounce buffer was used to handle the block operation @bio in @q. + * This occurs when hardware limitations prevent a direct transfer of + * data between the @bio data memory area and the IO device. Use of a + * bounce buffer requires extra copying of data and decreases + * performance. + */ TRACE_EVENT(block_bio_bounce, TP_PROTO(struct request_queue *q, struct bio *bio), @@ -138,6 +197,14 @@ TRACE_EVENT(block_bio_bounce, __entry->nr_sector, __entry->comm) ); +/** + * block_bio_complete - completed all work on the block operation + * @q: queue holding the block operation + * @bio: block operation completed + * + * This tracepoint indicates there is no further work to do on this + * block IO operation @bio. + */ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), @@ -193,6 +260,14 @@ DECLARE_EVENT_CLASS(block_bio, __entry->nr_sector, __entry->comm) ); +/** + * block_bio_backmerge - merging block operation to the end of an existing operation + * @q: queue holding operation + * @bio: new block operation to merge + * + * Merging block request @bio to the end of an existing block request + * in queue @q. + */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct request_queue *q, struct bio *bio), @@ -200,6 +275,14 @@ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_ARGS(q, bio) ); +/** + * block_bio_frontmerge - merging block operation to the beginning of an existing operation + * @q: queue holding operation + * @bio: new block operation to merge + * + * Merging block IO operation @bio to the beginning of an existing block + * operation in queue @q. + */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct request_queue *q, struct bio *bio), @@ -207,6 +290,13 @@ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_ARGS(q, bio) ); +/** + * block_bio_queue - putting new block IO operation in queue + * @q: queue holding operation + * @bio: new block operation + * + * About to place the block IO operation @bio into queue @q. + */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct request_queue *q, struct bio *bio), @@ -243,6 +333,15 @@ DECLARE_EVENT_CLASS(block_get_rq, __entry->nr_sector, __entry->comm) ); +/** + * block_getrq - get a free request entry in queue for block IO operations + * @q: queue for operations + * @bio: pending block IO operation + * @rw: low bit indicates a read (%0) or a write (%1) + * + * A request struct for queue @q has been allocated to handle the + * block IO operation @bio. + */ DEFINE_EVENT(block_get_rq, block_getrq, TP_PROTO(struct request_queue *q, struct bio *bio, int rw), @@ -250,6 +349,17 @@ DEFINE_EVENT(block_get_rq, block_getrq, TP_ARGS(q, bio, rw) ); +/** + * block_sleeprq - waiting to get a free request entry in queue for block IO operation + * @q: queue for operation + * @bio: pending block IO operation + * @rw: low bit indicates a read (%0) or a write (%1) + * + * In the case where a request struct cannot be provided for queue @q + * the process needs to wait for an request struct to become + * available. This tracepoint event is generated each time the + * process goes to sleep waiting for request struct become available. + */ DEFINE_EVENT(block_get_rq, block_sleeprq, TP_PROTO(struct request_queue *q, struct bio *bio, int rw), @@ -257,6 +367,14 @@ DEFINE_EVENT(block_get_rq, block_sleeprq, TP_ARGS(q, bio, rw) ); +/** + * block_plug - keep operations requests in request queue + * @q: request queue to plug + * + * Plug the request queue @q. Do not allow block operation requests + * to be sent to the device driver. Instead, accumulate requests in + * the queue to improve throughput performance of the block device. + */ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), @@ -293,6 +411,13 @@ DECLARE_EVENT_CLASS(block_unplug, TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); +/** + * block_unplug_timer - timed release of operations requests in queue to device driver + * @q: request queue to unplug + * + * Unplug the request queue @q because a timer expired and allow block + * operation requests to be sent to the device driver. + */ DEFINE_EVENT(block_unplug, block_unplug_timer, TP_PROTO(struct request_queue *q), @@ -300,6 +425,13 @@ DEFINE_EVENT(block_unplug, block_unplug_timer, TP_ARGS(q) ); +/** + * block_unplug_io - release of operations requests in request queue + * @q: request queue to unplug + * + * Unplug request queue @q because device driver is scheduled to work + * on elements in the request queue. + */ DEFINE_EVENT(block_unplug, block_unplug_io, TP_PROTO(struct request_queue *q), @@ -307,6 +439,17 @@ DEFINE_EVENT(block_unplug, block_unplug_io, TP_ARGS(q) ); +/** + * block_split - split a single bio struct into two bio structs + * @q: queue containing the bio + * @bio: block operation being split + * @new_sector: The starting sector for the new bio + * + * The bio request @bio in request queue @q needs to be split into two + * bio requests. The newly created @bio request starts at + * @new_sector. This split may be required due to hardware limitation + * such as operation crossing device boundaries in a RAID system. + */ TRACE_EVENT(block_split, TP_PROTO(struct request_queue *q, struct bio *bio, @@ -337,6 +480,16 @@ TRACE_EVENT(block_split, __entry->comm) ); +/** + * block_remap - map request for a partition to the raw device + * @q: queue holding the operation + * @bio: revised operation + * @dev: device for the operation + * @from: original sector for the operation + * + * An operation for a partition on a block device has been mapped to the + * raw block device. + */ TRACE_EVENT(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, @@ -370,6 +523,17 @@ TRACE_EVENT(block_remap, (unsigned long long)__entry->old_sector) ); +/** + * block_rq_remap - map request for a block operation request + * @q: queue holding the operation + * @rq: block IO operation request + * @dev: device for the operation + * @from: original sector for the operation + * + * The block operation request @rq in @q has been remapped. The block + * operation request @rq holds the current information and @from hold + * the original sector. + */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, From 8a03ae2a5baed3df09e5643615bdd853fc142a09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Jan 2010 20:39:07 +0000 Subject: [PATCH 04/34] block: drbd: Convert semaphore to mutex The bm_change semaphore is semantically a mutex. Convert it to a real mutex. Signed-off-by: Thomas Gleixner Signed-off-by: Philipp Reisner --- drivers/block/drbd/drbd_bitmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index b61057e77882..f58e76581c4b 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -66,7 +66,7 @@ struct drbd_bitmap { size_t bm_words; size_t bm_number_of_pages; sector_t bm_dev_capacity; - struct semaphore bm_change; /* serializes resize operations */ + struct mutex bm_change; /* serializes resize operations */ atomic_t bm_async_io; wait_queue_head_t bm_io_wait; @@ -114,7 +114,7 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why) return; } - trylock_failed = down_trylock(&b->bm_change); + trylock_failed = !mutex_trylock(&b->bm_change); if (trylock_failed) { dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", @@ -125,7 +125,7 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why) b->bm_task == mdev->receiver.task ? "receiver" : b->bm_task == mdev->asender.task ? "asender" : b->bm_task == mdev->worker.task ? "worker" : "?"); - down(&b->bm_change); + mutex_lock(&b->bm_change); } if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); @@ -147,7 +147,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) b->bm_why = NULL; b->bm_task = NULL; - up(&b->bm_change); + mutex_unlock(&b->bm_change); } /* word offset to long pointer */ @@ -295,7 +295,7 @@ int drbd_bm_init(struct drbd_conf *mdev) if (!b) return -ENOMEM; spin_lock_init(&b->bm_lock); - init_MUTEX(&b->bm_change); + mutex_init(&b->bm_change); init_waitqueue_head(&b->bm_io_wait); mdev->bitmap = b; From cf14c2e987ba0a09a7b09be2ecd55af0bc9c17b4 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 2 Feb 2010 21:03:50 +0100 Subject: [PATCH 05/34] drbd: --dry-run option for drbdsetup net ( drbdadm -- --dry-run connect ) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 8 +++++++- drivers/block/drbd/drbd_main.c | 16 ++++++++++++++-- drivers/block/drbd/drbd_receiver.c | 22 ++++++++++++++++++++-- include/linux/drbd.h | 2 +- include/linux/drbd_nl.h | 1 + 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2bf3a6ef3684..1aae724e37fb 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -443,13 +443,18 @@ struct p_rs_param_89 { char csums_alg[SHARED_SECRET_MAX]; } __packed; +enum drbd_conn_flags { + CF_WANT_LOSE = 1, + CF_DRY_RUN = 2, +}; + struct p_protocol { struct p_header head; u32 protocol; u32 after_sb_0p; u32 after_sb_1p; u32 after_sb_2p; - u32 want_lose; + u32 conn_flags; u32 two_primaries; /* Since protocol version 87 and higher. */ @@ -791,6 +796,7 @@ enum { * while this is set. */ RESIZE_PENDING, /* Size change detected locally, waiting for the response from * the peer, if it changed there as well. */ + CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ }; struct drbd_bitmap; /* opaque for drbd_conf */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ab871e00ffc5..b2d347d18c7d 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1668,7 +1668,7 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) int drbd_send_protocol(struct drbd_conf *mdev) { struct p_protocol *p; - int size, rv; + int size, cf, rv; size = sizeof(struct p_protocol); @@ -1685,9 +1685,21 @@ int drbd_send_protocol(struct drbd_conf *mdev) p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); - p->want_lose = cpu_to_be32(mdev->net_conf->want_lose); p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); + cf = 0; + if (mdev->net_conf->want_lose) + cf |= CF_WANT_LOSE; + if (mdev->net_conf->dry_run) { + if (mdev->agreed_pro_version >= 92) + cf |= CF_DRY_RUN; + else { + dev_err(DEV, "--dry-run is not supported by peer"); + return 0; + } + } + p->conn_flags = cpu_to_be32(cf); + if (mdev->agreed_pro_version >= 87) strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d065c646b35a..8bcde4a9632b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2538,6 +2538,16 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol } } + if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) { + if (hg == 0) + dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); + else + dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.", + drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), + abs(hg) >= 2 ? "full" : "bit-map based"); + return C_MASK; + } + if (abs(hg) >= 2) { dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) @@ -2585,7 +2595,7 @@ static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) struct p_protocol *p = (struct p_protocol *)h; int header_size, data_size; int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; - int p_want_lose, p_two_primaries; + int p_want_lose, p_two_primaries, cf; char p_integrity_alg[SHARED_SECRET_MAX] = ""; header_size = sizeof(*p) - sizeof(*h); @@ -2598,8 +2608,14 @@ static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) p_after_sb_0p = be32_to_cpu(p->after_sb_0p); p_after_sb_1p = be32_to_cpu(p->after_sb_1p); p_after_sb_2p = be32_to_cpu(p->after_sb_2p); - p_want_lose = be32_to_cpu(p->want_lose); p_two_primaries = be32_to_cpu(p->two_primaries); + cf = be32_to_cpu(p->conn_flags); + p_want_lose = cf & CF_WANT_LOSE; + + clear_bit(CONN_DRY_RUN, &mdev->flags); + + if (cf & CF_DRY_RUN) + set_bit(CONN_DRY_RUN, &mdev->flags); if (p_proto != mdev->net_conf->wire_protocol) { dev_err(DEV, "incompatible communication protocols\n"); @@ -3125,6 +3141,8 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); peer_state.disk = D_DISKLESS; } else { + if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) + return FALSE; D_ASSERT(oconn == C_WF_REPORT_PARAMS); drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); return FALSE; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 78962272338a..4341b1a97a34 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -56,7 +56,7 @@ extern const char *drbd_buildtag(void); #define REL_VERSION "8.3.7" #define API_VERSION 88 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 91 +#define PRO_VERSION_MAX 92 enum drbd_io_error_p { diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index a4d82f895994..b41050e8cc2f 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -63,6 +63,7 @@ NL_PACKET(net_conf, 5, NL_BIT( 41, T_MAY_IGNORE, always_asbp) NL_BIT( 61, T_MAY_IGNORE, no_cork) NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) + NL_BIT( 70, T_MANDATORY, dry_run) ) NL_PACKET(disconnect, 6, ) From 4aa83b7bf122106669346eef40632289f540653f Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 26 Feb 2010 16:53:24 +0100 Subject: [PATCH 06/34] drbd: fix NULL pointer dereference on 4k hard sect size we still don't support 4k 'physical' sectors 'natively', but use a read-modify-write workaround. And we even tried to use the extra page before we allocated it :( Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 38 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 4df3b40b1057..d53d36cd0e57 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -941,6 +941,25 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp drbd_md_set_sector_offsets(mdev, nbc); + /* allocate a second IO page if logical_block_size != 512 */ + logical_block_size = bdev_logical_block_size(nbc->md_bdev); + if (logical_block_size == 0) + logical_block_size = MD_SECTOR_SIZE; + + if (logical_block_size != MD_SECTOR_SIZE) { + if (!mdev->md_io_tmpp) { + struct page *page = alloc_page(GFP_NOIO); + if (!page) + goto force_diskless_dec; + + dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", + logical_block_size, MD_SECTOR_SIZE); + dev_warn(DEV, "Workaround engaged (has performance impact).\n"); + + mdev->md_io_tmpp = page; + } + } + if (!mdev->bitmap) { if (drbd_bm_init(mdev)) { retcode = ERR_NOMEM; @@ -980,25 +999,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp goto force_diskless_dec; } - /* allocate a second IO page if logical_block_size != 512 */ - logical_block_size = bdev_logical_block_size(nbc->md_bdev); - if (logical_block_size == 0) - logical_block_size = MD_SECTOR_SIZE; - - if (logical_block_size != MD_SECTOR_SIZE) { - if (!mdev->md_io_tmpp) { - struct page *page = alloc_page(GFP_NOIO); - if (!page) - goto force_diskless_dec; - - dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", - logical_block_size, MD_SECTOR_SIZE); - dev_warn(DEV, "Workaround engaged (has performance impact).\n"); - - mdev->md_io_tmpp = page; - } - } - /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ if (nbc->dc.no_md_flush) From 580b9767dbdf2c049c4d05330c70ea786ef01016 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 26 Feb 2010 23:15:23 +0100 Subject: [PATCH 07/34] drbd: fix broken state change after split-brain attach while connected Situation: we have diverging data sets, i.e. we had a split brain somewhen, but currently are connected, one node diskless. Then we try to attach that disk, figure it is consistent, but has a diverging data set, we refuse to attach. This led to strange state changes: 22:18:35 bb drbd1: peer( Unknown -> Primary ) conn( WFReportParams -> Connected) pdsk( DUnknown -> UpToDate ) 22:19:30 bb drbd1: disk( Diskless -> Attaching ) 22:19:30 bb drbd1: disk( Attaching -> Negotiating ) 22:19:30 bb drbd1: drbd_sync_handshake: 22:19:30 bb drbd1: self 97BF25798B9D5222:F33D1F62ADE698DD:4269796F9D027C83:AC45D8B5C3C1BF93 bits:19449 flags:0 22:19:30 bb drbd1: peer 280DFB6E125465D3:F33D1F62ADE698DC:4269796F9D027C82:AC45D8B5C3C1BF93 bits:2575806 flags:0 22:19:30 bb drbd1: uuid_compare()=100 by rule 90 22:19:30 bb drbd1: Split-Brain detected, dropping connection! 22:19:30 bb drbd1: disk( Negotiating -> Diskless ) while the other side says: 22:19:30 aa drbd1: Split-Brain detected, dropping connection! 22:19:30 aa drbd1: Disk attach process on the peer node was aborted. 22:19:30 aa drbd1: conn( Connected -> TOO_LARGE ) pdsk( Diskless -> Consistent ) This should be fixed now. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 8bcde4a9632b..41f36a9cd407 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2513,6 +2513,10 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol } if (hg == -100) { + /* FIXME this log message is not correct if we end up here + * after an attempted attach on a diskless node. + * We just refuse to attach -- well, we drop the "connection" + * to that disk, in a way... */ dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); drbd_khelper(mdev, "split-brain"); return C_MASK; @@ -3134,12 +3138,13 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h) put_ldev(mdev); if (nconn == C_MASK) { + nconn = C_CONNECTED; if (mdev->state.disk == D_NEGOTIATING) { drbd_force_state(mdev, NS(disk, D_DISKLESS)); - nconn = C_CONNECTED; } else if (peer_state.disk == D_NEGOTIATING) { dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); peer_state.disk = D_DISKLESS; + real_peer_disk = D_DISKLESS; } else { if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) return FALSE; From 676396d545350a70d922605ec23c2ed26124334a Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 3 Mar 2010 02:08:22 +0100 Subject: [PATCH 08/34] fix unit of rs_same_csums accounting Depending on resync request size, we need to account for more than one bit. Impact: cosmetic If SyncTarget reported correctly 100% equal checksums, the SyncSource usually reported 12% equal checksums instead, because it only counted requests, we typically do 32k resync requests, and the bitmap granularity is still 4k. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index b453c2bca3be..d97a811ad0d2 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -938,7 +938,8 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) if (eq) { drbd_set_in_sync(mdev, e->sector, e->size); - mdev->rs_same_csum++; + /* rs_same_csums unit is BM_BLOCK_SIZE */ + mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT; ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); } else { inc_rs_pending(mdev); From 4589d7f829951c1713ef5a4ad1a9bb563da329b5 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 3 Mar 2010 02:25:33 +0100 Subject: [PATCH 09/34] drbd_disconnect: grab meta.socket mutex as well Fixes a race and potential kernel panic if e.g. the worker was just about to send a few P_RS_IS_IN_SYNC via the meta socket for checksum based resync, while the receiver destroys the sockets in drbd_disconnect. To make sure no-one is using the meta socket, it is not enough to stop the asender... Grab the meta socket mutex before destroying it. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 4 ++++ drivers/block/drbd/drbd_receiver.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index b2d347d18c7d..67e0fc542249 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3173,14 +3173,18 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) void drbd_free_sock(struct drbd_conf *mdev) { if (mdev->data.socket) { + mutex_lock(&mdev->data.mutex); kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); sock_release(mdev->data.socket); mdev->data.socket = NULL; + mutex_unlock(&mdev->data.mutex); } if (mdev->meta.socket) { + mutex_lock(&mdev->meta.mutex); kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); sock_release(mdev->meta.socket); mdev->meta.socket = NULL; + mutex_unlock(&mdev->meta.mutex); } } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 41f36a9cd407..d803e6c257e2 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3617,10 +3617,7 @@ static void drbd_disconnect(struct drbd_conf *mdev) /* asender does not clean up anything. it must not interfere, either */ drbd_thread_stop(&mdev->asender); - - mutex_lock(&mdev->data.mutex); drbd_free_sock(mdev); - mutex_unlock(&mdev->data.mutex); spin_lock_irq(&mdev->req_lock); _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); From c42b6cf4b38c9726d4b46c48d04197c9ca74d773 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 3 Mar 2010 02:44:11 +0100 Subject: [PATCH 10/34] drbd: add missing drbd command names to avoid in error messages cmdname() should map command number to its human readable representation. The string table was incomplete, though. Maybe rather do a switch() block, and let the compiler help us to keep it complete? Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1aae724e37fb..844206c31851 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -261,6 +261,9 @@ static inline const char *cmdname(enum drbd_packets cmd) [P_OV_REQUEST] = "OVRequest", [P_OV_REPLY] = "OVReply", [P_OV_RESULT] = "OVResult", + [P_CSUM_RS_REQUEST] = "CsumRSRequest", + [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_COMPRESSED_BITMAP] = "CBitmap", [P_MAX_CMD] = NULL, }; From 309d1608cce32903d67d47e7545e232c400b6aa0 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 2 Mar 2010 15:03:44 +0100 Subject: [PATCH 11/34] drbd: Reduce the time an empty resync takes usually This mitigates changes introduced with commit: http://git.drbd.org/?p=drbd-8.3.git;a=commit;h=4b6803a3276652da3737 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_receiver.c | 2 ++ drivers/block/drbd/drbd_worker.c | 12 +++++++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 844206c31851..2d5cebbbf253 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -800,6 +800,7 @@ enum { RESIZE_PENDING, /* Size change detected locally, waiting for the response from * the peer, if it changed there as well. */ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ + GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ }; struct drbd_bitmap; /* opaque for drbd_conf */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d803e6c257e2..ed9f1de24a71 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4074,6 +4074,8 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) { /* restore idle timeout */ mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags)) + wake_up(&mdev->misc_wait); return TRUE; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d97a811ad0d2..4672f2f37b51 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1289,6 +1289,14 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na) return retcode; } +static void ping_peer(struct drbd_conf *mdev) +{ + clear_bit(GOT_PING_ACK, &mdev->flags); + request_ping(mdev); + wait_event(mdev->misc_wait, + test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); +} + /** * drbd_start_resync() - Start the resync process * @mdev: DRBD device. @@ -1383,9 +1391,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) if (mdev->rs_total == 0) { /* Peer still reachable? Beware of failing before-resync-target handlers! */ - request_ping(mdev); - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */ + ping_peer(mdev); drbd_resync_finished(mdev); return; } From d0c3f60f3611ceac9b1e4fdffd1497337568e7cb Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 2 Mar 2010 15:06:45 +0100 Subject: [PATCH 12/34] drbd: Make sure we do not send state updates during an empty resync [Bugz 271] This is a race condition that existed for ages. The previous commit reduces the window, this one closes it. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4672f2f37b51..44bf6d11197e 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1380,7 +1380,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) _drbd_pause_after(mdev); } write_unlock_irq(&global_state_lock); - drbd_state_unlock(mdev); put_ldev(mdev); if (r == SS_SUCCESS) { @@ -1393,7 +1392,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) /* Peer still reachable? Beware of failing before-resync-target handlers! */ ping_peer(mdev); drbd_resync_finished(mdev); - return; } /* ns.conn may already be != mdev->state.conn, @@ -1405,6 +1403,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) drbd_md_sync(mdev); } + drbd_state_unlock(mdev); } int drbd_worker(struct drbd_thread *thi) From d10a33c68b8526d95ef6ee72b371c392d48df4d3 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 4 Mar 2010 15:11:39 +0100 Subject: [PATCH 13/34] drbd: Forcing primary should also work for Consistent disks [Bugz 266] Up to now this only worked for Outdated and Inconsistent disks, that it did not worked for Consistent disks was an inconsistent omission. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index d53d36cd0e57..6492e321ec00 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -285,8 +285,8 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) } if (r == SS_NO_UP_TO_DATE_DISK && force && - (mdev->state.disk == D_INCONSISTENT || - mdev->state.disk == D_OUTDATED)) { + (mdev->state.disk < D_UP_TO_DATE && + mdev->state.disk >= D_INCONSISTENT)) { mask.disk = D_MASK; val.disk = D_UP_TO_DATE; forced = 1; From 1f55243024087b56aef0b1e6d9c0ea89c76f0a6b Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 4 Mar 2010 15:51:01 +0100 Subject: [PATCH 14/34] drbd: Renamed overwrite_peer to primary_force Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 2 +- include/linux/drbd_nl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6492e321ec00..6429d2b19e06 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -407,7 +407,7 @@ static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } reply->ret_code = - drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer); + drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force); return 0; } diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index b41050e8cc2f..f7431a4ca608 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -12,7 +12,7 @@ #endif NL_PACKET(primary, 1, - NL_BIT( 1, T_MAY_IGNORE, overwrite_peer) + NL_BIT( 1, T_MAY_IGNORE, primary_force) ) NL_PACKET(secondary, 2, ) From 39ad2bbb5900d1bc9ae8f06cebb4cb2529d9e42e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 4 Mar 2010 15:52:30 +0100 Subject: [PATCH 15/34] drbd: fix al-to-on-disk-bitmap for 4k logical_block_size Up to now, applying the in-core activity-log to the on-disk bitmap did not care for logical_block_size. On logical_block_size != 512 byte, this very likely results in misalligned block access and spurious "io errors". We now simply always submit aligned whole 4k blocks, fixing this for logical block sizes of 512, 1024, 2048 and 4096. For even larger logical block sizes, this won't work. But I'm not aware of devices with such properties being available. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 17956ff6a08d..43e57f395fd6 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -536,7 +536,9 @@ static void atodb_endio(struct bio *bio, int error) put_ldev(mdev); } +/* sector to word */ #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) + /* activity log to on disk bitmap -- prepare bio unless that sector * is already covered by previously prepared bios */ static int atodb_prepare_unless_covered(struct drbd_conf *mdev, @@ -546,13 +548,20 @@ static int atodb_prepare_unless_covered(struct drbd_conf *mdev, { struct bio *bio; struct page *page; - sector_t on_disk_sector = enr + mdev->ldev->md.md_offset - + mdev->ldev->md.bm_offset; + sector_t on_disk_sector; unsigned int page_offset = PAGE_SIZE; int offset; int i = 0; int err = -ENOMEM; + /* We always write aligned, full 4k blocks, + * so we can ignore the logical_block_size (for now) */ + enr &= ~7U; + on_disk_sector = enr + mdev->ldev->md.md_offset + + mdev->ldev->md.bm_offset; + + D_ASSERT(!(on_disk_sector & 7U)); + /* Check if that enr is already covered by an already created bio. * Caution, bios[] is not NULL terminated, * but only initialized to all NULL. @@ -588,7 +597,7 @@ static int atodb_prepare_unless_covered(struct drbd_conf *mdev, offset = S2W(enr); drbd_bm_get_lel(mdev, offset, - min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), + min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset), kmap(page) + page_offset); kunmap(page); @@ -597,7 +606,7 @@ static int atodb_prepare_unless_covered(struct drbd_conf *mdev, bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; - if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE) + if (bio_add_page(bio, page, 4096, page_offset) != 4096) goto out_put_page; atomic_inc(&wc->count); From c12ec0a2d94001003dfb929ce14c287fca0522b0 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 11 Mar 2010 14:09:47 -0800 Subject: [PATCH 16/34] paride: fix off-by-one test With `while (j++ < PX_SPIN)' j reaches PX_SPIN + 1 after the loop. This is probably unlikely to produce a problem. Signed-off-by: Roel Kluin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/paride/pcd.c | 4 ++-- drivers/block/paride/pf.c | 4 ++-- drivers/block/paride/pt.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 8866ca369d5e..71acf4e53356 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -341,11 +341,11 @@ static int pcd_wait(struct pcd_unit *cd, int go, int stop, char *fun, char *msg) && (j++ < PCD_SPIN)) udelay(PCD_DELAY); - if ((r & (IDE_ERR & stop)) || (j >= PCD_SPIN)) { + if ((r & (IDE_ERR & stop)) || (j > PCD_SPIN)) { s = read_reg(cd, 7); e = read_reg(cd, 1); p = read_reg(cd, 2); - if (j >= PCD_SPIN) + if (j > PCD_SPIN) e |= 0x100; if (fun) printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index ddb4f9abd480..c059aab3006b 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -391,11 +391,11 @@ static int pf_wait(struct pf_unit *pf, int go, int stop, char *fun, char *msg) && (j++ < PF_SPIN)) udelay(PF_SPIN_DEL); - if ((r & (STAT_ERR & stop)) || (j >= PF_SPIN)) { + if ((r & (STAT_ERR & stop)) || (j > PF_SPIN)) { s = read_reg(pf, 7); e = read_reg(pf, 1); p = read_reg(pf, 2); - if (j >= PF_SPIN) + if (j > PF_SPIN) e |= 0x100; if (fun) printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 1e4006e18f03..bc5825fdeaab 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -274,11 +274,11 @@ static int pt_wait(struct pt_unit *tape, int go, int stop, char *fun, char *msg) && (j++ < PT_SPIN)) udelay(PT_SPIN_DEL); - if ((r & (STAT_ERR & stop)) || (j >= PT_SPIN)) { + if ((r & (STAT_ERR & stop)) || (j > PT_SPIN)) { s = read_reg(pi, 7); e = read_reg(pi, 1); p = read_reg(pi, 2); - if (j >= PT_SPIN) + if (j > PT_SPIN) e |= 0x100; if (fun) printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" From f11c9c5c259cb2c3d698548dc3936f773ab1f5b9 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Thu, 11 Mar 2010 14:09:47 -0800 Subject: [PATCH 17/34] vfs: improve writeback_inodes_wb() Do not pin/unpin superblock for every inode in writeback_inodes_wb(), pin it for the whole group of inodes which belong to the same superblock and call writeback_sb_inodes() handler for them. Signed-off-by: Edward Shishkin Cc: Jens Axboe Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 133 +++++++++++++++++++++----------------- include/linux/writeback.h | 3 + 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76fc4d594acb..6841effa47ca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -553,108 +553,85 @@ select_queue: return ret; } -static void unpin_sb_for_writeback(struct super_block **psb) +static void unpin_sb_for_writeback(struct super_block *sb) { - struct super_block *sb = *psb; - - if (sb) { - up_read(&sb->s_umount); - put_super(sb); - *psb = NULL; - } + up_read(&sb->s_umount); + put_super(sb); } +enum sb_pin_state { + SB_PINNED, + SB_NOT_PINNED, + SB_PIN_FAILED +}; + /* * For WB_SYNC_NONE writeback, the caller does not have the sb pinned * before calling writeback. So make sure that we do pin it, so it doesn't * go away while we are writing inodes from it. - * - * Returns 0 if the super was successfully pinned (or pinning wasn't needed), - * 1 if we failed. */ -static int pin_sb_for_writeback(struct writeback_control *wbc, - struct inode *inode, struct super_block **psb) +static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, + struct super_block *sb) { - struct super_block *sb = inode->i_sb; - - /* - * If this sb is already pinned, nothing more to do. If not and - * *psb is non-NULL, unpin the old one first - */ - if (sb == *psb) - return 0; - else if (*psb) - unpin_sb_for_writeback(psb); - /* * Caller must already hold the ref for this */ if (wbc->sync_mode == WB_SYNC_ALL) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - return 0; + return SB_NOT_PINNED; } - spin_lock(&sb_lock); sb->s_count++; if (down_read_trylock(&sb->s_umount)) { if (sb->s_root) { spin_unlock(&sb_lock); - goto pinned; + return SB_PINNED; } /* * umounted, drop rwsem again and fall through to failure */ up_read(&sb->s_umount); } - sb->s_count--; spin_unlock(&sb_lock); - return 1; -pinned: - *psb = sb; - return 0; + return SB_PIN_FAILED; } -static void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +/* + * Write a portion of b_io inodes which belong to @sb. + * If @wbc->sb != NULL, then find and write all such + * inodes. Otherwise write only ones which go sequentially + * in reverse order. + * Return 1, if the caller writeback routine should be + * interrupted. Otherwise return 0. + */ +static int writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct writeback_control *wbc) { - struct super_block *sb = wbc->sb, *pin_sb = NULL; - const unsigned long start = jiffies; /* livelock avoidance */ - - spin_lock(&inode_lock); - - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - while (!list_empty(&wb->b_io)) { - struct inode *inode = list_entry(wb->b_io.prev, - struct inode, i_list); long pages_skipped; - - /* - * super block given and doesn't match, skip this inode - */ - if (sb && sb != inode->i_sb) { + struct inode *inode = list_entry(wb->b_io.prev, + struct inode, i_list); + if (wbc->sb && sb != inode->i_sb) { + /* super block given and doesn't + match, skip this inode */ redirty_tail(inode); continue; } - + if (sb != inode->i_sb) + /* finish with this superblock */ + return 0; if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. */ - if (inode_dirtied_after(inode, start)) - break; - - if (pin_sb_for_writeback(wbc, inode, &pin_sb)) { - requeue_io(inode); - continue; - } + if (inode_dirtied_after(inode, wbc->wb_start)) + return 1; BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); @@ -673,14 +650,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, spin_lock(&inode_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; - break; + return 1; } if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } + /* b_io is empty */ + return 1; +} - unpin_sb_for_writeback(&pin_sb); +static void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + int ret = 0; + wbc->wb_start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = list_entry(wb->b_io.prev, + struct inode, i_list); + struct super_block *sb = inode->i_sb; + enum sb_pin_state state; + + if (wbc->sb && sb != wbc->sb) { + /* super block given and doesn't + match, skip this inode */ + redirty_tail(inode); + continue; + } + state = pin_sb_for_writeback(wbc, sb); + + if (state == SB_PIN_FAILED) { + requeue_io(inode); + continue; + } + ret = writeback_sb_inodes(sb, wb, wbc); + + if (state == SB_PINNED) + unpin_sb_for_writeback(sb); + if (ret) + break; + } spin_unlock(&inode_lock); /* Leave any unwritten inodes on b_io */ } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 76e8903cd204..36520ded3e06 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -34,6 +34,9 @@ struct writeback_control { enum writeback_sync_modes sync_mode; unsigned long *older_than_this; /* If !NULL, only write back inodes older than this */ + unsigned long wb_start; /* Time writeback_inodes_wb was + called. This is needed to avoid + extra jobs and livelock */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ From 2cda2728aa1c8c006418a24f867b25e5eb7a32e2 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 15 Mar 2010 12:46:51 +0100 Subject: [PATCH 18/34] block: Fix overrun in lcm() and move it to lib lcm() was defined to take integer-sized arguments. The supplied arguments are multiplied, however, causing us to overflow given sufficiently large input. That in turn led to incorrect optimal I/O size reporting in some cases (RAID over RAID). Switch lcm() over to unsigned long similar to gcd() and move the function from blk-settings.c to lib. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 11 +---------- include/linux/lcm.h | 8 ++++++++ lib/Makefile | 2 +- lib/lcm.c | 15 +++++++++++++++ 4 files changed, 25 insertions(+), 11 deletions(-) create mode 100644 include/linux/lcm.h create mode 100644 lib/lcm.c diff --git a/block/blk-settings.c b/block/blk-settings.c index 31e7a9375c13..4c4700dca56a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include /* for max_pfn/max_low_pfn */ #include +#include #include #include "blk.h" @@ -461,16 +462,6 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); -static unsigned int lcm(unsigned int a, unsigned int b) -{ - if (a && b) - return (a * b) / gcd(a, b); - else if (b) - return b; - - return a; -} - /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) diff --git a/include/linux/lcm.h b/include/linux/lcm.h new file mode 100644 index 000000000000..7bf01d779b45 --- /dev/null +++ b/include/linux/lcm.h @@ -0,0 +1,8 @@ +#ifndef _LCM_H +#define _LCM_H + +#include + +unsigned long lcm(unsigned long a, unsigned long b) __attribute_const__; + +#endif /* _LCM_H */ diff --git a/lib/Makefile b/lib/Makefile index 2e152aed7198..0d4015205c64 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -21,7 +21,7 @@ lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o list_sort.o + string_helpers.o gcd.o lcm.o list_sort.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/lcm.c b/lib/lcm.c new file mode 100644 index 000000000000..157cd88a6ffc --- /dev/null +++ b/lib/lcm.c @@ -0,0 +1,15 @@ +#include +#include +#include + +/* Lowest common multiple */ +unsigned long lcm(unsigned long a, unsigned long b) +{ + if (a && b) + return (a * b) / gcd(a, b); + else if (b) + return b; + + return a; +} +EXPORT_SYMBOL_GPL(lcm); From ee714f2dd33e726346e34f5cda12543162f4753e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 10 Mar 2010 00:48:32 -0500 Subject: [PATCH 19/34] block: Finalize conversion of block limits functions Remove compatibility wrappers and update remaining drivers. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 1 - drivers/block/virtio_blk.c | 5 ++--- include/linux/blkdev.h | 24 ------------------------ 3 files changed, 2 insertions(+), 28 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 459f1bc25a7b..c5f22bb0a48e 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -2533,7 +2533,6 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) Controller->RequestQueue[n] = RequestQueue; blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); RequestQueue->queuedata = Controller; - blk_queue_max_hw_segments(RequestQueue, Controller->DriverScatterGatherLimit); blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit); blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand); disk->queue = RequestQueue; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 3c64af05fa82..653817ceeedd 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -347,14 +347,13 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) set_capacity(vblk->disk, cap); /* We can handle whatever the host told us to handle. */ - blk_queue_max_phys_segments(q, vblk->sg_elems-2); - blk_queue_max_hw_segments(q, vblk->sg_elems-2); + blk_queue_max_segments(q, vblk->sg_elems-2); /* No need to bounce any requests */ blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); /* No real sector limit. */ - blk_queue_max_sectors(q, -1U); + blk_queue_max_hw_sectors(q, -1U); /* Host can optionally specify maximum segment size and number of * segments. */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ebd22dbed861..41551c9341b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -921,26 +921,7 @@ extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); - -/* Temporary compatibility wrapper */ -static inline void blk_queue_max_sectors(struct request_queue *q, unsigned int max) -{ - blk_queue_max_hw_sectors(q, max); -} - extern void blk_queue_max_segments(struct request_queue *, unsigned short); - -static inline void blk_queue_max_phys_segments(struct request_queue *q, unsigned short max) -{ - blk_queue_max_segments(q, max); -} - -static inline void blk_queue_max_hw_segments(struct request_queue *q, unsigned short max) -{ - blk_queue_max_segments(q, max); -} - - extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); @@ -1030,11 +1011,6 @@ static inline int sb_issue_discard(struct super_block *sb, extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); -#define MAX_PHYS_SEGMENTS 128 -#define MAX_HW_SEGMENTS 128 -#define SAFE_MAX_SECTORS 255 -#define MAX_SEGMENT_SIZE 65536 - enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, From c77a5710b7e23847bfdb81fcaa10b585f65c960a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 10 Mar 2010 00:48:33 -0500 Subject: [PATCH 20/34] block: Export max number of segments and max segment size in sysfs These two values are useful when debugging issues surrounding maximum I/O size. Put them in sysfs with the rest of the queue limits. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e85442415db3..fad86550255a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -106,6 +106,19 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_sectors_kb, (page)); } +static ssize_t queue_max_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_segments(q), (page)); +} + +static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + return queue_var_show(queue_max_segment_size(q), (page)); + + return queue_var_show(PAGE_CACHE_SIZE, (page)); +} + static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) { return queue_var_show(queue_logical_block_size(q), page); @@ -280,6 +293,16 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = { .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_max_segments_entry = { + .attr = {.name = "max_segments", .mode = S_IRUGO }, + .show = queue_max_segments_show, +}; + +static struct queue_sysfs_entry queue_max_segment_size_entry = { + .attr = {.name = "max_segment_size", .mode = S_IRUGO }, + .show = queue_max_segment_size_show, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -355,6 +378,8 @@ static struct attribute *default_attrs[] = { &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_max_segments_entry.attr, + &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, From 97fedbbe1046b3118f49df249840ca21041eefe4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Mar 2010 08:55:32 +0100 Subject: [PATCH 21/34] Remove GENHD_FL_DRIVERFS This flag is not used, so best discarded. Signed-off-by: NeilBrown -- Hi Jens, I came across this recently - these are the only two occurances of "GENHD_FL_DRIVERFS" in the kernel, so it cannot be needed. NeilBrown Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 2 +- include/linux/genhd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1dd4d8407694..466fae8ef770 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2185,7 +2185,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie) blk_queue_prep_rq(sdp->request_queue, sd_prep_fn); gd->driverfs_dev = &sdp->sdev_gendev; - gd->flags = GENHD_FL_EXT_DEVT | GENHD_FL_DRIVERFS; + gd->flags = GENHD_FL_EXT_DEVT; if (sdp->removable) gd->flags |= GENHD_FL_REMOVABLE; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 56b50514ab25..5f2f4c4d8fb0 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -109,7 +109,7 @@ struct hd_struct { }; #define GENHD_FL_REMOVABLE 1 -#define GENHD_FL_DRIVERFS 2 +/* 2 is unused */ #define GENHD_FL_MEDIA_CHANGE_NOTIFY 4 #define GENHD_FL_CD 8 #define GENHD_FL_UP 16 From 910ac735bad53ce54741a72a5b19ab69794ae069 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 16 Mar 2010 08:57:15 +0100 Subject: [PATCH 22/34] block: make CONFIG_BLK_CGROUP visible Make the config visible, so we can choose from CONFIG_BLK_CGROUP=y and CONFIG_BLK_CGROUP=m when CONFIG_IOSCHED_CFQ=m. Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- block/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/Kconfig b/block/Kconfig index e20fbde0875c..f9e89f4d94bb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -78,8 +78,9 @@ config BLK_DEV_INTEGRITY Protection. If in doubt, say N. config BLK_CGROUP - bool + tristate "Block cgroup support" depends on CGROUPS + depends on CFQ_GROUP_IOSCHED default n ---help--- Generic block IO controller cgroup interface. This is the common From e9ce335df51ff782035a15c261a3c0c9892a1767 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 19 Mar 2010 08:03:04 +0100 Subject: [PATCH 23/34] cfq-iosched: fix a kbuild regression Alex Shi reported a kbuild regression which is about 10% performance lost. He bisected to this commit: 3dde36ddea3e07dd025c4c1ba47edec91606fec0. The reason is cfqq_close() can't find close cooperator. Restoring cfq_rq_close()'s threshold to original value makes the regression go away. Since for_preempt parameter isn't used anymore, this patch deletes it. Reported-by: Alex Shi Signed-off-by: Shaohua Li Acked-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dee9d9378fee..8d5a2f2f7fb9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -47,6 +47,7 @@ static const int cfq_hist_divisor = 4; #define CFQ_SERVICE_SHIFT 12 #define CFQQ_SEEK_THR (sector_t)(8 * 100) +#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) @@ -1660,9 +1661,9 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, } static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq, bool for_preempt) + struct request *rq) { - return cfq_dist_from_last(cfqd, rq) <= CFQQ_SEEK_THR; + return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; } static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, @@ -1689,7 +1690,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. */ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1700,7 +1701,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; return NULL; @@ -3103,7 +3104,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq, true)) + if (cfq_rq_close(cfqd, cfqq, rq)) return true; return false; From 181fdde3b4268cb7b4af76ba6337e7ec8accbb36 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 19 Mar 2010 08:58:16 +0100 Subject: [PATCH 24/34] block: remove 16 bytes of padding from struct request on 64bits Remove alignment padding to shrink struct request from 336 to 320 bytes so needing one fewer cacheline and therefore removing 48 bytes from struct request_queue. Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 41551c9341b6..6690e8bae7bb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -158,7 +158,6 @@ enum rq_flag_bits { struct request { struct list_head queuelist; struct call_single_data csd; - int cpu; struct request_queue *q; @@ -166,9 +165,11 @@ struct request { enum rq_cmd_type_bits cmd_type; unsigned long atomic_flags; + int cpu; + /* the following two fields are internal, NEVER access directly */ - sector_t __sector; /* sector cursor */ unsigned int __data_len; /* total data len */ + sector_t __sector; /* sector cursor */ struct bio *bio; struct bio *biotail; @@ -201,20 +202,20 @@ struct request { unsigned short ioprio; + int ref_count; + void *special; /* opaque pointer available for LLD use */ char *buffer; /* kaddr of the current segment if available */ int tag; int errors; - int ref_count; - /* * when request is used as a packet command carrier */ - unsigned short cmd_len; unsigned char __cmd[BLK_MAX_CDB]; unsigned char *cmd; + unsigned short cmd_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; From 8a6d9b149f105f8bdfa8e42dd9753e45a1887a16 Mon Sep 17 00:00:00 2001 From: Ferenc Wagner Date: Wed, 24 Mar 2010 08:20:03 +0100 Subject: [PATCH 25/34] i2o: Remove the dangerous kobj_to_i2o_device macro This macro worked only when applied to variables named 'kobj'. While this could have been fixed by simply renaming the macro argument, a more type-safe replacement by an inline function would be preferred. However, nobody uses this macro, so it's simpler to just remove it. Signed-off-by: Ferenc Wagner Signed-off-by: Jens Axboe --- include/linux/i2o.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/i2o.h b/include/linux/i2o.h index 87018dc5527d..9e7a12d6385d 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -782,7 +782,6 @@ extern int i2o_exec_lct_get(struct i2o_controller *); #define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver) #define to_i2o_device(dev) container_of(dev, struct i2o_device, device) #define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device) -#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj)) /** * i2o_out_to_virt - Turn an I2O message to a virtual address From b1ffe737f5b743115ee46ffb59e338e580c54902 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 25 Mar 2010 15:45:03 +0100 Subject: [PATCH 26/34] cfq-iosched: Add additional blktrace log messages in CFQ for easier debugging These have helped us debug some issues we've noticed in earlier IO controller versions and should be useful now as well. The extra logging covers: - idling behavior. Since there are so many conditions based on which we decide to idle or not, this patch adds a log message for some conditions that we've found useful. - workload slices and current prio and workload type Changelog from v1: o moved log message from cfq_set_active_queue() to __cfq_set_active_queue() o changed queue_count to st->count Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8d5a2f2f7fb9..2f91c5351949 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1518,7 +1518,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", + cfqd->serving_prio, cfqd->serving_type); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -1788,7 +1789,11 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - return service_tree->count == 1 && cfq_cfqq_sync(cfqq); + if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) + return 1; + cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", + service_tree->count); + return 0; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -1833,8 +1838,11 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * time slice. */ if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) + (cfqq->slice_end - jiffies < cic->ttime_mean)) { + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", + cic->ttime_mean); return; + } cfq_mark_cfqq_wait_request(cfqq); @@ -2042,6 +2050,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max(slice, 2 * cfqd->cfq_slice_idle); slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; cfqd->noidle_tree_requires_idle = false; } @@ -3308,6 +3317,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) if (cfq_should_wait_busy(cfqd, cfqq)) { cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; cfq_mark_cfqq_wait_busy(cfqq); + cfq_log_cfqq(cfqd, cfqq, "will busy wait"); } /* From 39c01b219fd30c74869b6fc8749f7900f04e9ef6 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 25 Mar 2010 15:45:57 +0100 Subject: [PATCH 27/34] cfq-iosched: Do not merge queues of BE and IDLE classes Even if they are found to be co-operating. The prio_trees do not have any IDLE cfqqs on them. cfq_close_cooperator() is called from cfq_select_queue() and cfq_completed_request(). The latter ensures that the close cooperator code does not get invoked if the current cfqq is of class IDLE but the former doesn't seem to have any such checks. So an IDLE cfqq may get merged with a BE cfqq from the same group which should be avoided. Signed-off-by: Divyesh Shah Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2f91c5351949..2c7a0f4f3cd7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1723,6 +1723,8 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; + if (cfq_class_idle(cur_cfqq)) + return NULL; if (!cfq_cfqq_sync(cur_cfqq)) return NULL; if (CFQQ_SEEKY(cur_cfqq)) From 61917bdaaf6bea4b885525cf63f65272914f6be2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 2 Apr 2010 08:39:40 +0200 Subject: [PATCH 28/34] cciss: unlock on error path We take the spin_lock again in fail_all_cmds() so we need to unlock here. Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9e3af307aae1..eb5ff0531cfb 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3341,6 +3341,7 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id) printk(KERN_WARNING "cciss: controller cciss%d failed, stopping.\n", h->ctlr); + spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags); fail_all_cmds(h->ctlr); return IRQ_HANDLED; } From b2b163dd47024e445410b72d0c5df6d819c14dfd Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 2 Apr 2010 08:40:33 +0200 Subject: [PATCH 29/34] drbd: lc_element_by_index() never returns NULL Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 43e57f395fd6..df018990c422 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -1336,7 +1336,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev) /* ok, ->resync is there. */ for (i = 0; i < mdev->resync->nr_elements; i++) { e = lc_element_by_index(mdev->resync, i); - bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + bm_ext = lc_entry(e, struct bm_extent, lce); if (bm_ext->lce.lc_number == LC_FREE) continue; if (bm_ext->lce.lc_number == mdev->resync_wenr) { From a506aedc51093544ff0f9610af6066d18cb6abbe Mon Sep 17 00:00:00 2001 From: "wzt.wzt@gmail.com" Date: Fri, 2 Apr 2010 08:41:14 +0200 Subject: [PATCH 30/34] Block: Fix block/elevator.c elevator_get() off-by-one error elevator_get() not check the name length, if the name length > sizeof(elv), elv will miss the '\0'. And elv buffer will be replace "-iosched" as something like aaaaaaaaa, then call request_module() can load an not trust module. Signed-off-by: Zhitong Wang Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/elevator.c b/block/elevator.c index df75676f6671..76e3702d5381 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -154,7 +154,7 @@ static struct elevator_type *elevator_get(const char *name) spin_unlock(&elv_list_lock); - sprintf(elv, "%s-iosched", name); + snprintf(elv, sizeof(elv), "%s-iosched", name); request_module("%s", elv); spin_lock(&elv_list_lock); From 144214537370b4f133a735446ebe86e90cfb2501 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 2 Apr 2010 09:46:55 +0200 Subject: [PATCH 31/34] backing-dev: Handle class_create() failure I hit this when we had a bug in IDR for a few days. Basically sysfs would fail to create new inodes since it uses an IDR and therefore class_create would fail. While we are unlikely to see this fail we may as well handle it instead of oopsing. Signed-off-by: Anton Blanchard Signed-off-by: Jens Axboe --- mm/backing-dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0e8ca0347707..f13e067e1467 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -227,6 +227,9 @@ static struct device_attribute bdi_dev_attrs[] = { static __init int bdi_class_init(void) { bdi_class = class_create(THIS_MODULE, "bdi"); + if (IS_ERR(bdi_class)) + return PTR_ERR(bdi_class); + bdi_class->dev_attrs = bdi_dev_attrs; bdi_debug_init(); return 0; From a74b2adae06265b8cfa335d7d40d4a5abd11e977 Mon Sep 17 00:00:00 2001 From: Ricky Benitez Date: Mon, 5 Apr 2010 18:22:17 +0200 Subject: [PATCH 32/34] block: expose the statistics in blkio.time and blkio.sectors for the root cgroup Currently, the io statistics for the root cgroup are maintained, but they are not shown because the device information is not available at the point that the root blkio cgroup is created. This patch updates the device information when the statistics are updated so that the statistics become visible. Signed-off-by: Ricky Benitez Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2c7a0f4f3cd7..7104ac816fb6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -948,6 +948,11 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) unsigned int major, minor; cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + goto done; + } if (cfqg || !create) goto done; From 02246c41171097ceab3246f6dc251ac89de6004b Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Thu, 8 Apr 2010 21:39:31 +0200 Subject: [PATCH 33/34] loop: Update mtime when writing using aops Update mtime when writing to backing filesystem using the address space operations write_begin and write_end. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bd112c8c7bcd..1c21a3f23868 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -238,6 +238,8 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, if (ret) goto fail; + file_update_time(file); + transfer_result = lo_do_transfer(lo, WRITE, page, offset, bvec->bv_page, bv_offs, size, IV); copied = size; From 3440c49f5c5ecb4f29b0544aa87da71888404f8f Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Fri, 9 Apr 2010 09:29:57 +0200 Subject: [PATCH 34/34] cfq-iosched: Fix the incorrect timeslice accounting with forced_dispatch When CFQ dispatches requests forcefully due to a barrier or changing iosched, it runs through all cfqq's dispatching requests and then expires each queue. However, it does not activate a cfqq before flushing its IOs resulting in using stale values for computing slice_used. This patch fixes it by calling activate queue before flushing reuqests from each queue. This is useful mostly for barrier requests because when the iosched is changing it really doesnt matter if we have incorrect accounting since we're going to break down all structures anyway. We also now expire the current timeslice before moving on with the dispatch to accurately account slice used for that cfqq. Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7104ac816fb6..b773000f8a06 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2205,10 +2205,13 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - + /* Expire the timeslice of the current active queue first */ cfq_slice_expired(cfqd, 0); + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { + __cfq_set_active_queue(cfqd, cfqq); + dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } + BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched);