From 6050fa4c84cc93ae509f5105f585a429dffc5633 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Wed, 24 Nov 2021 19:47:40 +0900
Subject: [PATCH 01/32] loop: don't hold lo_mutex during __loop_clr_fd()

syzbot is reporting a circular locking problem at __loop_clr_fd() [1],
because commit 87579e9b7d8dc36e ("loop: use worker per cgroup instead
of kworker") calls destroy_workqueue() with lo->lo_mutex held.

Since all functions where lo->lo_state matters already check
lo->lo_state with lo->lo_mutex held (in order to avoid racing with e.g.
ioctl(LOOP_CTL_REMOVE)), and __loop_clr_fd() is called from exactly one
of ioctl(LOOP_CLR_FD) and close(), lo->lo_state == Lo_rundown serves as
an exclusive lock for __loop_clr_fd(). Therefore, hold lo->lo_mutex
inside __loop_clr_fd() only when asserting/updating lo->lo_state.

Since ioctl(LOOP_CLR_FD) depends on lo->lo_state == Lo_bound, a valid
lo->lo_backing_file must have been assigned by ioctl(LOOP_SET_FD) or
ioctl(LOOP_CONFIGURE). Thus, we can remove the lo->lo_backing_file test
and convert __loop_clr_fd() into a void function.

Link: https://syzkaller.appspot.com/bug?extid=63614029dfb79abd4383 [1]
Reported-by: syzbot
Signed-off-by: Tetsuo Handa
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/8ebe3b2e-8975-7f26-0620-7144a3b8b8cd@i-love.sakura.ne.jp
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 55 ++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0954ea8cf9e3..ba76319b5544 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1082,13 +1082,10 @@ out_putf:
 	return error;
 }
 
-static int __loop_clr_fd(struct loop_device *lo, bool release)
+static void __loop_clr_fd(struct loop_device *lo, bool release)
 {
-	struct file *filp = NULL;
+	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
-	int err = 0;
-	bool partscan = false;
-	int lo_number;
 	struct loop_worker *pos, *worker;
 
 	/*
@@ -1103,17 +1100,14 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	 * became visible.
 	 */
 
+	/*
+	 * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close()
+	 * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if
+	 * lo->lo_state has changed while waiting for lo->lo_mutex.
+	 */
 	mutex_lock(&lo->lo_mutex);
-	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
-		err = -ENXIO;
-		goto out_unlock;
-	}
-
-	filp = lo->lo_backing_file;
-	if (filp == NULL) {
-		err = -EINVAL;
-		goto out_unlock;
-	}
+	BUG_ON(lo->lo_state != Lo_rundown);
+	mutex_unlock(&lo->lo_mutex);
 
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);
@@ -1134,6 +1128,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	del_timer_sync(&lo->timer);
 
 	spin_lock_irq(&lo->lo_lock);
+	filp = lo->lo_backing_file;
 	lo->lo_backing_file = NULL;
 	spin_unlock_irq(&lo->lo_lock);
 
@@ -1153,12 +1148,11 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	module_put(THIS_MODULE);
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
-	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
-	lo_number = lo->lo_number;
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
-out_unlock:
-	mutex_unlock(&lo->lo_mutex);
-	if (partscan) {
+
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
+		int err;
+
 		/*
 		 * open_mutex has been held already in release path, so don't
 		 * acquire it if this function is called in such case.
@@ -1174,24 +1168,20 @@ out_unlock:
 			mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
-				__func__, lo_number, err);
+				__func__, lo->lo_number, err);
 		/* Device is gone, no point in returning error */
-		err = 0;
 	}
 
 	/*
 	 * lo->lo_state is set to Lo_unbound here after above partscan has
-	 * finished.
-	 *
-	 * There cannot be anybody else entering __loop_clr_fd() as
-	 * lo->lo_backing_file is already cleared and Lo_rundown state
-	 * protects us from all the other places trying to change the 'lo'
-	 * device.
+	 * finished. There cannot be anybody else entering __loop_clr_fd() as
+	 * Lo_rundown state protects us from all the other places trying to
+	 * change the 'lo' device.
 	 */
-	mutex_lock(&lo->lo_mutex);
 	lo->lo_flags = 0;
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
+	mutex_lock(&lo->lo_mutex);
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
 
@@ -1200,9 +1190,7 @@ out_unlock:
 	 * lo_mutex triggers a circular lock dependency possibility warning as
 	 * fput can take open_mutex which is usually taken before lo_mutex.
 	 */
-	if (filp)
-		fput(filp);
-	return err;
+	fput(filp);
 }
 
 static int loop_clr_fd(struct loop_device *lo)
@@ -1234,7 +1222,8 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_rundown;
 	mutex_unlock(&lo->lo_mutex);
 
-	return __loop_clr_fd(lo, false);
+	__loop_clr_fd(lo, false);
+	return 0;
 }
 
 static int

From 2bfdbe8b7ebd17b5331071071a910fbabc64b436 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 3 Dec 2021 10:39:35 +0800
Subject: [PATCH 02/32] null_blk: allow zero poll queues

From the user's viewpoint, there is no reason to disallow zero poll
queues. Also, we sometimes need to compare I/O polling between poll
mode and IRQ mode, so refusing zero poll queues gets in the way.

Fixes: 15dfc662ef31 ("null_blk: Fix handling of submit_queues and poll_queues attributes")
Cc: Shin'ichiro Kawasaki
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20211203023935.3424042-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/null_blk/main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 54f7d490f8eb..b4ff5ae1f70c 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -340,9 +340,9 @@ static int nullb_update_nr_hw_queues(struct nullb_device *dev,
 		return 0;
 
 	/*
-	 * Make sure at least one queue exists for each of submit and poll.
+	 * Make sure at least one submit queue exists.
 	 */
-	if (!submit_queues || !poll_queues)
+	if (!submit_queues)
 		return -EINVAL;
 
 	/*
@@ -1917,8 +1917,6 @@ static int null_validate_conf(struct nullb_device *dev)
 
 	if (dev->poll_queues > g_poll_queues)
 		dev->poll_queues = g_poll_queues;
-	else if (dev->poll_queues == 0)
-		dev->poll_queues = 1;
 	dev->prev_poll_queues = dev->poll_queues;
 
 	dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);

From fb48febce7e30baed94dd791e19521abd2c3fd83 Mon Sep 17 00:00:00 2001
From: Tasos Sahanidis
Date: Fri, 3 Sep 2021 09:47:58 +0300
Subject: [PATCH 03/32] floppy: Fix hang in watchdog when disk is ejected

When the watchdog detects a disk change, it calls cancel_activity(),
which in turn tries to cancel the fd_timer delayed work.

In the above scenario, fd_timer_fn is set to fd_watchdog(), meaning it
is trying to cancel its own work. This results in a hang, as
cancel_delayed_work_sync() waits for the watchdog (itself) to return,
which never happens.
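In workqueue terms this is the generic self-cancellation pattern; a
minimal sketch follows, with hypothetical my_timer/my_watchdog names
rather than the floppy driver's code:

#include <linux/workqueue.h>

static void my_watchdog(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_timer, my_watchdog);

static void my_watchdog(struct work_struct *work)
{
	/*
	 * cancel_delayed_work_sync(&my_timer) here would deadlock:
	 * it waits for the currently running instance of my_timer's
	 * work function to finish, and that running instance is us.
	 */

	/* Safe: only dequeues a pending instance, never waits. */
	cancel_delayed_work(&my_timer);
}

Because cancel_delayed_work() never sleeps waiting for the handler, it
is safe to call from inside the work function itself.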
This can be reproduced relatively consistently by attempting to read a broken floppy, and ejecting it while IO is being attempted and retried. To resolve this, this patch calls cancel_delayed_work() instead, which cancels the work without waiting for the watchdog to return and finish. Before this regression was introduced, the code in this section used del_timer(), and not del_timer_sync() to delete the watchdog timer. Link: https://lore.kernel.org/r/399e486c-6540-db27-76aa-7a271b061f76@tasossah.com Fixes: 070ad7e793dc ("floppy: convert to delayed work and single-thread wq") Signed-off-by: Tasos Sahanidis Signed-off-by: Denis Efremov Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 0c638de25023..f0e36c18f349 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1015,7 +1015,7 @@ static DECLARE_DELAYED_WORK(fd_timer, fd_timer_workfn); static void cancel_activity(void) { do_floppy = NULL; - cancel_delayed_work_sync(&fd_timer); + cancel_delayed_work(&fd_timer); cancel_work_sync(&floppy_work); } From 545a32498c536ee152331cd2e7d2416aa0f20e01 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Tue, 16 Nov 2021 21:10:33 +0800 Subject: [PATCH 04/32] floppy: Add max size check for user space request We need to check the max request size that is from user space before allocating pages. If the request size exceeds the limit, return -EINVAL. This check can avoid the warning below from page allocator. WARNING: CPU: 3 PID: 16525 at mm/page_alloc.c:5344 current_gfp_context include/linux/sched/mm.h:195 [inline] WARNING: CPU: 3 PID: 16525 at mm/page_alloc.c:5344 __alloc_pages+0x45d/0x500 mm/page_alloc.c:5356 Modules linked in: CPU: 3 PID: 16525 Comm: syz-executor.3 Not tainted 5.15.0-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 RIP: 0010:__alloc_pages+0x45d/0x500 mm/page_alloc.c:5344 Code: be c9 00 00 00 48 c7 c7 20 4a 97 89 c6 05 62 32 a7 0b 01 e8 74 9a 42 07 e9 6a ff ff ff 0f 0b e9 a0 fd ff ff 40 80 e5 3f eb 88 <0f> 0b e9 18 ff ff ff 4c 89 ef 44 89 e6 45 31 ed e8 1e 76 ff ff e9 RSP: 0018:ffffc90023b87850 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 1ffff92004770f0b RCX: dffffc0000000000 RDX: 0000000000000000 RSI: 0000000000000033 RDI: 0000000000010cc1 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 R10: ffffffff81bb4686 R11: 0000000000000001 R12: ffffffff902c1960 R13: 0000000000000033 R14: 0000000000000000 R15: ffff88804cf64a30 FS: 0000000000000000(0000) GS:ffff88802cd00000(0063) knlGS:00000000f44b4b40 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 000000002c921000 CR3: 000000004f507000 CR4: 0000000000150ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: alloc_pages+0x1a7/0x300 mm/mempolicy.c:2191 __get_free_pages+0x8/0x40 mm/page_alloc.c:5418 raw_cmd_copyin drivers/block/floppy.c:3113 [inline] raw_cmd_ioctl drivers/block/floppy.c:3160 [inline] fd_locked_ioctl+0x12e5/0x2820 drivers/block/floppy.c:3528 fd_ioctl drivers/block/floppy.c:3555 [inline] fd_compat_ioctl+0x891/0x1b60 drivers/block/floppy.c:3869 compat_blkdev_ioctl+0x3b8/0x810 block/ioctl.c:662 __do_compat_sys_ioctl+0x1c7/0x290 fs/ioctl.c:972 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0x65/0xf0 arch/x86/entry/common.c:178 do_fast_syscall_32+0x2f/0x70 arch/x86/entry/common.c:203 
entry_SYSENTER_compat_after_hwframe+0x4d/0x5c

Reported-by: syzbot+23a02c7df2cf2bc93fa2@syzkaller.appspotmail.com
Link: https://lore.kernel.org/r/20211116131033.27685-1-sxwjean@me.com
Signed-off-by: Xiongwei Song
Signed-off-by: Denis Efremov
Signed-off-by: Jens Axboe
---
 drivers/block/floppy.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f0e36c18f349..e611411a934c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3081,6 +3081,8 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr)
 	}
 }
 
+#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT)
+
 static int raw_cmd_copyin(int cmd, void __user *param,
 				struct floppy_raw_cmd **rcmd)
 {
@@ -3108,7 +3110,7 @@ loop:
 	ptr->resultcode = 0;
 
 	if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
-		if (ptr->length <= 0)
+		if (ptr->length <= 0 || ptr->length >= MAX_LEN)
 			return -EINVAL;
 		ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length);
 		fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);

From 2385ebf38f94d4f7761b1e9a4973d04753da02c2 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 3 Dec 2021 16:17:03 +0800
Subject: [PATCH 05/32] block: null_blk: batched complete poll requests

Complete poll requests via blk_mq_add_to_batch() and
blk_mq_end_request_batch(), so that the batched completion code path is
covered when running the null_blk test.

Meanwhile, this approach shows a ~14% IOPS boost on
't/io_uring /dev/nullb0' in my test.

Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20211203081703.3506020-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/null_blk/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index b4ff5ae1f70c..20534a2daf17 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1574,7 +1574,9 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 		cmd = blk_mq_rq_to_pdu(req);
 		cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
 						blk_rq_sectors(req));
-		end_cmd(cmd);
+		if (!blk_mq_add_to_batch(req, iob, cmd->error,
+					blk_mq_end_request_batch))
+			end_cmd(cmd);
 		nr++;
 	}
 

From db67097aa6f2587b44055f2e16db72a11e17faef Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2021 21:31:56 -0700
Subject: [PATCH 06/32] pktcdvd: stop using bdi congestion framework.

The bdi congestion framework isn't widely used and should be
deprecated. pktcdvd makes use of it to track congestion, but this can
be done entirely internally to pktcdvd, so it doesn't need to use the
framework.

So introduce a "congested" flag. When waiting for bio_queue_size to
drop, set this flag and use a var_waitqueue() to wait for it. When
bio_queue_size does drop and this flag is set, clear the flag and call
wake_up_var().

We don't use a wait_var_event macro for the waiting, as we need to set
the flag and drop the spinlock before calling schedule(); while that is
possible with __wait_var_event(), the result is not easy to read.
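The diff below shows the real code; as a standalone reference, here is
the same open-coded wait reduced to a skeleton. The struct throttle and
its fields are hypothetical stand-ins for pd->lock, pd->congested,
pd->bio_queue_size and the congestion marks; finish_wait() is included
here for completeness of the pattern:

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait_bit.h>

struct throttle {
	spinlock_t lock;	/* serializes level and congested */
	bool congested;		/* someone is waiting for level to drop */
	int level;		/* e.g. number of queued bios */
	int low_mark;		/* wake the waiter at or below this */
};

static void throttle_wait(struct throttle *t)
{
	struct wait_bit_queue_entry wqe;

	init_wait_var_entry(&wqe, &t->congested, 0);
	spin_lock(&t->lock);
	for (;;) {
		prepare_to_wait_event(__var_waitqueue(&t->congested),
				      &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
		if (t->level <= t->low_mark)
			break;
		/* The flag must be set before the lock is dropped ... */
		t->congested = true;
		spin_unlock(&t->lock);
		schedule();
		spin_lock(&t->lock);
	}
	/* Remove ourselves from the queue and restore TASK_RUNNING. */
	finish_wait(__var_waitqueue(&t->congested), &wqe.wq_entry);
	spin_unlock(&t->lock);
}

/* ... so the completion side cannot miss a sleeping waiter. */
static void throttle_drop(struct throttle *t)
{
	spin_lock(&t->lock);
	t->level--;
	if (t->congested && t->level <= t->low_mark) {
		t->congested = false;
		wake_up_var(&t->congested);
	}
	spin_unlock(&t->lock);
}

Setting the flag under the same spinlock that guards the counter is
what makes the lost-wakeup race impossible without taking the wait
queue's own lock in the fast path.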
Reviewed-by: Christoph Hellwig Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/163910843527.9928.857338663717630212@noble.neil.brown.name Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 31 ++++++++++++++++++++----------- include/linux/pktcdvd.h | 2 ++ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 887c98d61684..713b7dcf39f9 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1107,7 +1107,6 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) sector_t zone = 0; /* Suppress gcc warning */ struct pkt_rb_node *node, *first_node; struct rb_node *n; - int wakeup; atomic_set(&pd->scan_queue, 0); @@ -1179,12 +1178,14 @@ try_next_bio: spin_unlock(&pkt->lock); } /* check write congestion marks, and if bio_queue_size is - below, wake up any waiters */ - wakeup = (pd->write_congestion_on > 0 - && pd->bio_queue_size <= pd->write_congestion_off); + * below, wake up any waiters + */ + if (pd->congested && + pd->bio_queue_size <= pd->write_congestion_off) { + pd->congested = false; + wake_up_var(&pd->congested); + } spin_unlock(&pd->lock); - if (wakeup) - clear_bdi_congested(pd->disk->bdi, BLK_RW_ASYNC); pkt->sleep_time = max(PACKET_WAIT_TIME, 1); pkt_set_state(pkt, PACKET_WAITING_STATE); @@ -2356,7 +2357,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) } spin_unlock(&pd->cdrw.active_list_lock); - /* + /* * Test if there is enough room left in the bio work queue * (queue size >= congestion on mark). * If not, wait till the work queue size is below the congestion off mark. @@ -2364,12 +2365,20 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(bio->bi_bdev->bd_disk->bdi, BLK_RW_ASYNC); - do { + struct wait_bit_queue_entry wqe; + + init_wait_var_entry(&wqe, &pd->congested, 0); + for (;;) { + prepare_to_wait_event(__var_waitqueue(&pd->congested), + &wqe.wq_entry, + TASK_UNINTERRUPTIBLE); + if (pd->bio_queue_size <= pd->write_congestion_off) + break; + pd->congested = true; spin_unlock(&pd->lock); - congestion_wait(BLK_RW_ASYNC, HZ); + schedule(); spin_lock(&pd->lock); - } while(pd->bio_queue_size > pd->write_congestion_off); + } } spin_unlock(&pd->lock); diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 174601554b06..c391e694aa26 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -183,6 +183,8 @@ struct pktcdvd_device spinlock_t lock; /* Serialize access to bio_queue */ struct rb_root bio_queue; /* Work queue of bios we need to handle */ int bio_queue_size; /* Number of nodes in bio_queue */ + bool congested; /* Someone is waiting for bio_queue_size + * to drop. */ sector_t current_sector; /* Keep track of where the elevator is */ atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ /* needs to be run. 
 */

From c5eafd790e1317f5bfc69845207f69f6447b4b2b Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 10 Dec 2021 16:32:44 -0700
Subject: [PATCH 07/32] null_blk: cast command status to integer

kernel test robot reports that sparse now triggers a warning on null_blk:

>> drivers/block/null_blk/main.c:1577:55: sparse: sparse: incorrect type in argument 3 (different base types) @@ expected int ioerror @@ got restricted blk_status_t [usertype] error @@
   drivers/block/null_blk/main.c:1577:55: sparse: expected int ioerror
   drivers/block/null_blk/main.c:1577:55: sparse: got restricted blk_status_t [usertype] error

because blk_mq_add_to_batch() takes an integer instead of a
blk_status_t. Just cast this to an integer to silence it; null_blk is
the odd one out here, since the command status is the "right" type. If
we change the function type, then we'll have to do that for other
callers too (existing and future ones).

Fixes: 2385ebf38f94 ("block: null_blk: batched complete poll requests")
Reported-by: kernel test robot
Signed-off-by: Jens Axboe
---
 drivers/block/null_blk/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 20534a2daf17..6be6ccd4a28f 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1574,7 +1574,7 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 		cmd = blk_mq_rq_to_pdu(req);
 		cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
 						blk_rq_sectors(req));
-		if (!blk_mq_add_to_batch(req, iob, cmd->error,
+		if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
 					blk_mq_end_request_batch))
 			end_cmd(cmd);
 		nr++;

From 322c4293ecc58110227b49d7e47ae37b9b03566f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Mon, 13 Dec 2021 21:55:27 +0900
Subject: [PATCH 08/32] loop: make autoclear operation asynchronous

syzbot is reporting a circular locking problem at __loop_clr_fd() [1],
because commit 87579e9b7d8dc36e ("loop: use worker per cgroup instead
of kworker") calls destroy_workqueue() with disk->open_mutex held.

This circular dependency cannot be broken unless we call
__loop_clr_fd() without holding disk->open_mutex. Therefore, defer
__loop_clr_fd() from lo_release() to a WQ context.

Link: https://syzkaller.appspot.com/bug?extid=643e4ce4b6ad1347d372 [1]
Reported-by: syzbot
Suggested-by: Christoph Hellwig
Cc: Jan Kara
Tested-by: syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com
Signed-off-by: Tetsuo Handa
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/1ed7df28-ebd6-71fb-70e5-1c2972e05ddb@i-love.sakura.ne.jp
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 65 ++++++++++++++++++++++++--------------------
 drivers/block/loop.h | 1 +
 2 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ba76319b5544..7f4ea06534c2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1082,7 +1082,7 @@ out_putf:
 	return error;
 }
 
-static void __loop_clr_fd(struct loop_device *lo, bool release)
+static void __loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
@@ -1144,8 +1144,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	/* let user-space know about this change */
 	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
 	mapping_set_gfp_mask(filp->f_mapping, gfp);
-	/* This is safe: open() is still holding a reference.
*/ - module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); @@ -1153,44 +1151,52 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) if (lo->lo_flags & LO_FLAGS_PARTSCAN) { int err; - /* - * open_mutex has been held already in release path, so don't - * acquire it if this function is called in such case. - * - * If the reread partition isn't from release path, lo_refcnt - * must be at least one and it can only become zero when the - * current holder is released. - */ - if (!release) - mutex_lock(&lo->lo_disk->open_mutex); + mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); - if (!release) - mutex_unlock(&lo->lo_disk->open_mutex); + mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ } - /* - * lo->lo_state is set to Lo_unbound here after above partscan has - * finished. There cannot be anybody else entering __loop_clr_fd() as - * Lo_rundown state protects us from all the other places trying to - * change the 'lo' device. - */ lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART; + + fput(filp); +} + +static void loop_rundown_completed(struct loop_device *lo) +{ mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); + module_put(THIS_MODULE); +} - /* - * Need not hold lo_mutex to fput backing file. Calling fput holding - * lo_mutex triggers a circular lock dependency possibility warning as - * fput can take open_mutex which is usually taken before lo_mutex. - */ - fput(filp); +static void loop_rundown_workfn(struct work_struct *work) +{ + struct loop_device *lo = container_of(work, struct loop_device, + rundown_work); + struct block_device *bdev = lo->lo_device; + struct gendisk *disk = lo->lo_disk; + + __loop_clr_fd(lo); + kobject_put(&bdev->bd_device.kobj); + module_put(disk->fops->owner); + loop_rundown_completed(lo); +} + +static void loop_schedule_rundown(struct loop_device *lo) +{ + struct block_device *bdev = lo->lo_device; + struct gendisk *disk = lo->lo_disk; + + __module_get(disk->fops->owner); + kobject_get(&bdev->bd_device.kobj); + INIT_WORK(&lo->rundown_work, loop_rundown_workfn); + queue_work(system_long_wq, &lo->rundown_work); } static int loop_clr_fd(struct loop_device *lo) @@ -1222,7 +1228,8 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); - __loop_clr_fd(lo, false); + __loop_clr_fd(lo); + loop_rundown_completed(lo); return 0; } @@ -1747,7 +1754,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode) * In autoclear mode, stop the loop thread * and remove configuration after last close. 
*/ - __loop_clr_fd(lo, true); + loop_schedule_rundown(lo); return; } else if (lo->lo_state == Lo_bound) { /* diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 082d4b6bfc6a..918a7a2dc025 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -56,6 +56,7 @@ struct loop_device { struct gendisk *lo_disk; struct mutex lo_mutex; bool idr_visible; + struct work_struct rundown_work; }; struct loop_cmd { From 52a0cab35c568f896067641d8e07f798341954f5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Nov 2021 12:37:12 -0800 Subject: [PATCH 09/32] drbd: Use struct_group() to zero algs In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memset(), avoid intentionally writing across neighboring fields. Add a struct_group() for the algs so that memset() can correctly reason about the size. Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20211118203712.1288866-1-keescook@chromium.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 3 ++- drivers/block/drbd/drbd_protocol.h | 6 ++++-- drivers/block/drbd/drbd_receiver.c | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 07b3c6093e7d..6f450816c4fa 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -729,7 +729,8 @@ int drbd_send_sync_param(struct drbd_peer_device *peer_device) cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; /* initialize verify_alg and csums_alg */ - memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX); + memset(&p->algs, 0, sizeof(p->algs)); if (get_ldev(peer_device->device)) { dc = rcu_dereference(peer_device->device->ldev->disk_conf); diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index dea59c92ecc1..a882b65ab5d2 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -283,8 +283,10 @@ struct p_rs_param_89 { struct p_rs_param_95 { u32 resync_rate; - char verify_alg[SHARED_SECRET_MAX]; - char csums_alg[SHARED_SECRET_MAX]; + struct_group(algs, + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + ); u32 c_plan_ahead; u32 c_delay_target; u32 c_fill_target; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1f740e42e457..6df2539e215b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3921,7 +3921,8 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i /* initialize verify_alg and csums_alg */ p = pi->data; - memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX); + memset(&p->algs, 0, sizeof(p->algs)); err = drbd_recv_all(peer_device->connection, p, header_size); if (err) From 2920417c98dbe4b58200c12fc9dc152834b76e42 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 8 Dec 2021 13:24:46 -0600 Subject: [PATCH 10/32] mtip32xx: remove pointless drvdata checking The .suspend() and .resume() methods are only called after the .probe() method (mtip_pci_probe()) has set the drvdata and returned success. Therefore, if we get to mtip_pci_suspend() or mtip_pci_resume(), the drvdata must be valid. Drop the unnecessary checking. 
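The ordering guarantee being relied on here is a property of the driver
core, not of mtip32xx; a minimal sketch with a hypothetical foo driver:

#include <linux/pci.h>

struct foo_data {
	int quiesced;
};

static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct foo_data *dd = devm_kzalloc(&pdev->dev, sizeof(*dd),
					   GFP_KERNEL);

	if (!dd)
		return -ENOMEM;
	/* drvdata is set on every path that lets probe succeed ... */
	pci_set_drvdata(pdev, dd);
	return 0;
}

static int foo_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
	/*
	 * ... so by the time any PM callback runs, pci_get_drvdata()
	 * cannot return NULL and needs no defensive check.
	 */
	struct foo_data *dd = pci_get_drvdata(pdev);

	dd->quiesced = 1;
	return 0;
}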
Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211208192449.146076-2-helgaas@kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 30f471021a40..715ef350df54 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4150,12 +4150,6 @@ static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) int rv = 0; struct driver_data *dd = pci_get_drvdata(pdev); - if (!dd) { - dev_err(&pdev->dev, - "Driver private datastructure is NULL\n"); - return -EFAULT; - } - set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); /* Disable ports & interrupts then send standby immediate */ @@ -4189,14 +4183,7 @@ static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) static int mtip_pci_resume(struct pci_dev *pdev) { int rv = 0; - struct driver_data *dd; - - dd = pci_get_drvdata(pdev); - if (!dd) { - dev_err(&pdev->dev, - "Driver private datastructure is NULL\n"); - return -EFAULT; - } + struct driver_data *dd = pci_get_drvdata(pdev); /* Move the device to active State */ pci_set_power_state(pdev, PCI_D0); From 9e541f142dab67264075baaf8fd2eb4423742c16 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 8 Dec 2021 13:24:47 -0600 Subject: [PATCH 11/32] mtip32xx: remove pointless drvdata lookups Previously we passed a struct pci_dev * to mtip_check_surprise_removal(), which immediately looked up the driver_data. But all callers already have the driver_data pointer, so just pass it directly and skip the extra lookup. No functional change intended. Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211208192449.146076-3-helgaas@kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 715ef350df54..4c918c93e2fc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -136,16 +136,15 @@ struct mtip_compat_ide_task_request_s { * return value * true if device removed, else false */ -static bool mtip_check_surprise_removal(struct pci_dev *pdev) +static bool mtip_check_surprise_removal(struct driver_data *dd) { u16 vendor_id = 0; - struct driver_data *dd = pci_get_drvdata(pdev); if (dd->sr) return true; /* Read the vendorID from the configuration space */ - pci_read_config_word(pdev, 0x00, &vendor_id); + pci_read_config_word(dd->pdev, 0x00, &vendor_id); if (vendor_id == 0xFFFF) { dd->sr = true; if (dd->queue) @@ -447,7 +446,7 @@ static int mtip_device_reset(struct driver_data *dd) { int rv = 0; - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return 0; if (mtip_hba_reset(dd) < 0) @@ -727,7 +726,7 @@ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) dev_warn(&dd->pdev->dev, "Port stat errors %x unhandled\n", (port_stat & ~PORT_IRQ_HANDLED)); - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return; } if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) { @@ -752,7 +751,7 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) /* Acknowledge the interrupt status on the port.*/ port_stat = readl(port->mmio + PORT_IRQ_STAT); if (unlikely(port_stat == 0xFFFFFFFF)) { - mtip_check_surprise_removal(dd->pdev); + mtip_check_surprise_removal(dd); 
return IRQ_HANDLED; } writel(port_stat, port->mmio + PORT_IRQ_STAT); @@ -796,7 +795,7 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) } if (unlikely(port_stat & PORT_IRQ_ERR)) { - if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + if (unlikely(mtip_check_surprise_removal(dd))) { /* don't proceed further */ return IRQ_HANDLED; } @@ -915,7 +914,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) msleep(100); - if (mtip_check_surprise_removal(port->dd->pdev)) + if (mtip_check_surprise_removal(port->dd)) goto err_fault; active = mtip_commands_active(port); @@ -980,7 +979,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return -EFAULT; rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED); @@ -1022,7 +1021,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, fis->command, int_cmd->status); rv = -EIO; - if (mtip_check_surprise_removal(dd->pdev) || + if (mtip_check_surprise_removal(dd) || test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { dev_err(&dd->pdev->dev, @@ -2513,7 +2512,7 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) return -EFAULT; - if (mtip_check_surprise_removal(dd->pdev)) + if (mtip_check_surprise_removal(dd)) return -EFAULT; if (mtip_get_identify(dd->port, NULL) < 0) @@ -2891,7 +2890,7 @@ static int mtip_hw_init(struct driver_data *dd) time_before(jiffies, timeout)) { mdelay(100); } - if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + if (unlikely(mtip_check_surprise_removal(dd))) { timetaken = jiffies - timetaken; dev_warn(&dd->pdev->dev, "Surprise removal detected at %u ms\n", @@ -4098,7 +4097,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) list_add(&dd->remove_list, &removing_list); spin_unlock_irqrestore(&dev_lock, flags); - mtip_check_surprise_removal(pdev); + mtip_check_surprise_removal(dd); synchronize_irq(dd->pdev->irq); /* Spin until workers are done */ From cd97b7e0d78009b45e08b92441d9562f9f37968c Mon Sep 17 00:00:00 2001 From: Vaibhav Gupta Date: Wed, 8 Dec 2021 13:24:48 -0600 Subject: [PATCH 12/32] mtip32xx: convert to generic power management Convert mtip32xx from legacy PCI power management to the generic power management framework. Previously, mtip32xx used legacy PCI power management, where mtip_pci_suspend() and mtip_pci_resume() were responsible for both device-specific things and generic PCI things: mtip_pci_suspend mtip_block_suspend(dd) <-- device-specific pci_save_state(pdev) <-- generic PCI pci_set_power_state(pdev, pci_choose_state(pdev, state)) mtip_pci_resume pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state(pdev) <-- generic PCI pcim_enable_device(pdev) <-- generic PCI pci_set_master(pdev) <-- generic PCI mtip_block_resume(dd) <-- device-specific With generic power management, the PCI bus PM methods do the generic PCI things, and the driver needs only the device-specific part, i.e., suspend_devices_and_enter dpm_suspend_start(PMSG_SUSPEND) pci_pm_suspend # PCI bus .suspend() method mtip_pci_suspend # dev->driver->pm->suspend mtip_block_suspend <-- device-specific suspend_enter dpm_suspend_noirq(PMSG_SUSPEND) pci_pm_suspend_noirq # PCI bus .suspend_noirq() method pci_save_state <-- generic PCI pci_prepare_to_sleep <-- generic PCI pci_set_power_state ... 
dpm_resume_end(PMSG_RESUME) pci_pm_resume # PCI bus .resume() method pci_restore_standard_config pci_set_power_state(PCI_D0) <-- generic PCI pci_restore_state <-- generic PCI mtip_pci_resume # dev->driver->pm->resume mtip_block_resume <-- device-specific [bhelgaas: commit log] Link: https://lore.kernel.org/r/20210114115423.52414-2-vaibhavgupta40@gmail.com Signed-off-by: Vaibhav Gupta Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211208192449.146076-4-helgaas@kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 48 +++++++------------------------ 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4c918c93e2fc..e6005c232328 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4144,30 +4144,17 @@ static void mtip_pci_remove(struct pci_dev *pdev) * 0 Success * <0 Error */ -static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) +static int __maybe_unused mtip_pci_suspend(struct device *dev) { int rv = 0; - struct driver_data *dd = pci_get_drvdata(pdev); + struct driver_data *dd = dev_get_drvdata(dev); set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); /* Disable ports & interrupts then send standby immediate */ rv = mtip_block_suspend(dd); - if (rv < 0) { - dev_err(&pdev->dev, - "Failed to suspend controller\n"); - return rv; - } - - /* - * Save the pci config space to pdev structure & - * disable the device - */ - pci_save_state(pdev); - pci_disable_device(pdev); - - /* Move to Low power state*/ - pci_set_power_state(pdev, PCI_D3hot); + if (rv < 0) + dev_err(dev, "Failed to suspend controller\n"); return rv; } @@ -4179,25 +4166,10 @@ static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) * 0 Success * <0 Error */ -static int mtip_pci_resume(struct pci_dev *pdev) +static int __maybe_unused mtip_pci_resume(struct device *dev) { int rv = 0; - struct driver_data *dd = pci_get_drvdata(pdev); - - /* Move the device to active State */ - pci_set_power_state(pdev, PCI_D0); - - /* Restore PCI configuration space */ - pci_restore_state(pdev); - - /* Enable the PCI device*/ - rv = pcim_enable_device(pdev); - if (rv < 0) { - dev_err(&pdev->dev, - "Failed to enable card during resume\n"); - goto err; - } - pci_set_master(pdev); + struct driver_data *dd = dev_get_drvdata(dev); /* * Calls hbaReset, initPort, & startPort function @@ -4205,9 +4177,8 @@ static int mtip_pci_resume(struct pci_dev *pdev) */ rv = mtip_block_resume(dd); if (rv < 0) - dev_err(&pdev->dev, "Unable to resume\n"); + dev_err(dev, "Unable to resume\n"); -err: clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); return rv; @@ -4238,14 +4209,15 @@ static const struct pci_device_id mtip_pci_tbl[] = { { 0 } }; +static SIMPLE_DEV_PM_OPS(mtip_pci_pm_ops, mtip_pci_suspend, mtip_pci_resume); + /* Structure that describes the PCI driver functions. */ static struct pci_driver mtip_pci_driver = { .name = MTIP_DRV_NAME, .id_table = mtip_pci_tbl, .probe = mtip_pci_probe, .remove = mtip_pci_remove, - .suspend = mtip_pci_suspend, - .resume = mtip_pci_resume, + .driver.pm = &mtip_pci_pm_ops, .shutdown = mtip_pci_shutdown, }; From ac6f6548fcb3c6da8ff1653a16c66fc1719a2a3e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 8 Dec 2021 13:24:49 -0600 Subject: [PATCH 13/32] rsxx: Drop PCI legacy power management The rsxx driver doesn't support device suspend, so remove rsxx_pci_suspend(), the legacy PCI .suspend() method, completely. 
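Taken together, the two conversions above boil down to the following
shape; a sketch with a hypothetical bar driver, not the mtip32xx or
rsxx code:

#include <linux/pci.h>
#include <linux/pm.h>

struct bar_data {
	int state;
};

static int __maybe_unused bar_suspend(struct device *dev)
{
	struct bar_data *dd = dev_get_drvdata(dev);

	dd->state = 0;		/* device-specific quiesce only */
	return 0;
}

static int __maybe_unused bar_resume(struct device *dev)
{
	struct bar_data *dd = dev_get_drvdata(dev);

	dd->state = 1;		/* device-specific re-init only */
	return 0;
}

static SIMPLE_DEV_PM_OPS(bar_pm_ops, bar_suspend, bar_resume);

static struct pci_driver bar_driver = {
	.name		= "bar",
	/* .id_table, .probe, .remove as usual */
	.driver.pm	= &bar_pm_ops,
};

The PCI core now performs pci_save_state()/pci_set_power_state() and
friends around these callbacks, and a driver with no suspend support
simply leaves .driver.pm unset instead of stubbing a legacy .suspend()
that returns -ENOSYS.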
Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20211208192449.146076-5-helgaas@kernel.org Signed-off-by: Jens Axboe --- drivers/block/rsxx/core.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 8d9d69f5dfbc..19b85d16d711 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -1037,12 +1037,6 @@ static void rsxx_pci_remove(struct pci_dev *dev) kfree(card); } -static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state) -{ - /* We don't support suspend at this time. */ - return -ENOSYS; -} - static void rsxx_pci_shutdown(struct pci_dev *dev) { struct rsxx_cardinfo *card = pci_get_drvdata(dev); @@ -1083,7 +1077,6 @@ static struct pci_driver rsxx_pci_driver = { .id_table = rsxx_pci_ids, .probe = rsxx_pci_probe, .remove = rsxx_pci_remove, - .suspend = rsxx_pci_suspend, .shutdown = rsxx_pci_shutdown, .err_handler = &rsxx_err_handler, }; From 3427f2b2c533d97bcc57b4237c2af21a8bd2cdbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Dec 2021 09:42:44 +0100 Subject: [PATCH 14/32] block: remove the rsxx driver This driver was for rare and shortlived high end enterprise hardware and hasn't been maintained since 2014, which also means it never got converted to use blk-mq. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- MAINTAINERS | 6 - drivers/block/Kconfig | 11 - drivers/block/Makefile | 1 - drivers/block/rsxx/Makefile | 3 - drivers/block/rsxx/config.c | 197 ------ drivers/block/rsxx/core.c | 1119 -------------------------------- drivers/block/rsxx/cregs.c | 789 ---------------------- drivers/block/rsxx/dev.c | 306 --------- drivers/block/rsxx/dma.c | 1085 ------------------------------- drivers/block/rsxx/rsxx.h | 33 - drivers/block/rsxx/rsxx_cfg.h | 58 -- drivers/block/rsxx/rsxx_priv.h | 418 ------------ 12 files changed, 4026 deletions(-) delete mode 100644 drivers/block/rsxx/Makefile delete mode 100644 drivers/block/rsxx/config.c delete mode 100644 drivers/block/rsxx/core.c delete mode 100644 drivers/block/rsxx/cregs.c delete mode 100644 drivers/block/rsxx/dev.c delete mode 100644 drivers/block/rsxx/dma.c delete mode 100644 drivers/block/rsxx/rsxx.h delete mode 100644 drivers/block/rsxx/rsxx_cfg.h delete mode 100644 drivers/block/rsxx/rsxx_priv.h diff --git a/MAINTAINERS b/MAINTAINERS index 360e9aa0205d..6360f5de36bf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7425,12 +7425,6 @@ F: Documentation/firmware_class/ F: drivers/base/firmware_loader/ F: include/linux/firmware.h -FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card) -M: Joshua Morris -M: Philip Kelleher -S: Maintained -F: drivers/block/rsxx/ - FLEXTIMER FTM-QUADDEC DRIVER M: Patrick Havelange L: linux-iio@vger.kernel.org diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 2a51dfb09c8f..519b6d38d4df 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -392,17 +392,6 @@ config BLK_DEV_RBD If unsure, say N. -config BLK_DEV_RSXX - tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver" - depends on PCI - select CRC32 - help - Device driver for IBM's high speed PCIe SSD - storage device: Flash Adapter 900GB Full Height. - - To compile this driver as a module, choose M here: the - module will be called rsxx. 
- source "drivers/block/rnbd/Kconfig" endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 11a74f17c9ad..934a9c7c3a7c 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -34,7 +34,6 @@ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ -obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile deleted file mode 100644 index 7ef158099d33..000000000000 --- a/drivers/block/rsxx/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o -rsxx-objs := config.o core.o cregs.o dev.o dma.o diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c deleted file mode 100644 index 11ed1d9646b9..000000000000 --- a/drivers/block/rsxx/config.c +++ /dev/null @@ -1,197 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: config.c -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include -#include -#include - -#include "rsxx_priv.h" -#include "rsxx_cfg.h" - -static void initialize_config(struct rsxx_card_cfg *cfg) -{ - cfg->hdr.version = RSXX_CFG_VERSION; - - cfg->data.block_size = RSXX_HW_BLK_SIZE; - cfg->data.stripe_size = RSXX_HW_BLK_SIZE; - cfg->data.vendor_id = RSXX_VENDOR_ID_IBM; - cfg->data.cache_order = (-1); - cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED; - cfg->data.intr_coal.count = 0; - cfg->data.intr_coal.latency = 0; -} - -static u32 config_data_crc32(struct rsxx_card_cfg *cfg) -{ - /* - * Return the compliment of the CRC to ensure compatibility - * (i.e. this is how early rsxx drivers did it.) - */ - - return ~crc32(~0, &cfg->data, sizeof(cfg->data)); -} - - -/*----------------- Config Byte Swap Functions -------------------*/ -static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr) -{ - hdr->version = be32_to_cpu((__force __be32) hdr->version); - hdr->crc = be32_to_cpu((__force __be32) hdr->crc); -} - -static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr) -{ - hdr->version = (__force u32) cpu_to_be32(hdr->version); - hdr->crc = (__force u32) cpu_to_be32(hdr->crc); -} - -static void config_data_swab(struct rsxx_card_cfg *cfg) -{ - u32 *data = (u32 *) &cfg->data; - int i; - - for (i = 0; i < (sizeof(cfg->data) / 4); i++) - data[i] = swab32(data[i]); -} - -static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg) -{ - u32 *data = (u32 *) &cfg->data; - int i; - - for (i = 0; i < (sizeof(cfg->data) / 4); i++) - data[i] = le32_to_cpu((__force __le32) data[i]); -} - -static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg) -{ - u32 *data = (u32 *) &cfg->data; - int i; - - for (i = 0; i < (sizeof(cfg->data) / 4); i++) - data[i] = (__force u32) cpu_to_le32(data[i]); -} - - -/*----------------- Config Operations ------------------*/ -static int rsxx_save_config(struct rsxx_cardinfo *card) -{ - struct rsxx_card_cfg cfg; - int st; - - memcpy(&cfg, &card->config, sizeof(cfg)); - - if (unlikely(cfg.hdr.version != RSXX_CFG_VERSION)) { - dev_err(CARD_TO_DEV(card), - "Cannot save config with invalid version %d\n", - cfg.hdr.version); - return -EINVAL; - } - - /* Convert data to little endian for the CRC calculation. */ - config_data_cpu_to_le(&cfg); - - cfg.hdr.crc = config_data_crc32(&cfg); - - /* - * Swap the data from little endian to big endian so it can be - * stored. 
- */ - config_data_swab(&cfg); - config_hdr_cpu_to_be(&cfg.hdr); - - st = rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(cfg), &cfg, 1); - if (st) - return st; - - return 0; -} - -int rsxx_load_config(struct rsxx_cardinfo *card) -{ - int st; - u32 crc; - - st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config), - &card->config, 1); - if (st) { - dev_err(CARD_TO_DEV(card), - "Failed reading card config.\n"); - return st; - } - - config_hdr_be_to_cpu(&card->config.hdr); - - if (card->config.hdr.version == RSXX_CFG_VERSION) { - /* - * We calculate the CRC with the data in little endian, because - * early drivers did not take big endian CPUs into account. - * The data is always stored in big endian, so we need to byte - * swap it before calculating the CRC. - */ - - config_data_swab(&card->config); - - /* Check the CRC */ - crc = config_data_crc32(&card->config); - if (crc != card->config.hdr.crc) { - dev_err(CARD_TO_DEV(card), - "Config corruption detected!\n"); - dev_info(CARD_TO_DEV(card), - "CRC (sb x%08x is x%08x)\n", - card->config.hdr.crc, crc); - return -EIO; - } - - /* Convert the data to CPU byteorder */ - config_data_le_to_cpu(&card->config); - - } else if (card->config.hdr.version != 0) { - dev_err(CARD_TO_DEV(card), - "Invalid config version %d.\n", - card->config.hdr.version); - /* - * Config version changes require special handling from the - * user - */ - return -EINVAL; - } else { - dev_info(CARD_TO_DEV(card), - "Initializing card configuration.\n"); - initialize_config(&card->config); - st = rsxx_save_config(card); - if (st) - return st; - } - - card->config_valid = 1; - - dev_dbg(CARD_TO_DEV(card), "version: x%08x\n", - card->config.hdr.version); - dev_dbg(CARD_TO_DEV(card), "crc: x%08x\n", - card->config.hdr.crc); - dev_dbg(CARD_TO_DEV(card), "block_size: x%08x\n", - card->config.data.block_size); - dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n", - card->config.data.stripe_size); - dev_dbg(CARD_TO_DEV(card), "vendor_id: x%08x\n", - card->config.data.vendor_id); - dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n", - card->config.data.cache_order); - dev_dbg(CARD_TO_DEV(card), "mode: x%08x\n", - card->config.data.intr_coal.mode); - dev_dbg(CARD_TO_DEV(card), "count: x%08x\n", - card->config.data.intr_coal.count); - dev_dbg(CARD_TO_DEV(card), "latency: x%08x\n", - card->config.data.intr_coal.latency); - - return 0; -} - diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c deleted file mode 100644 index 19b85d16d711..000000000000 --- a/drivers/block/rsxx/core.c +++ /dev/null @@ -1,1119 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: core.c -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "rsxx_priv.h" -#include "rsxx_cfg.h" - -#define NO_LEGACY 0 -#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */ - -MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver"); -MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(DRIVER_VERSION); - -static unsigned int force_legacy = NO_LEGACY; -module_param(force_legacy, uint, 0444); -MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); - -static unsigned int sync_start = 1; -module_param(sync_start, uint, 0444); -MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete " - "until the card startup has 
completed."); - -static DEFINE_IDA(rsxx_disk_ida); - -/* --------------------Debugfs Setup ------------------- */ - -static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p) -{ - struct rsxx_cardinfo *card = m->private; - - seq_printf(m, "HWID 0x%08x\n", - ioread32(card->regmap + HWID)); - seq_printf(m, "SCRATCH 0x%08x\n", - ioread32(card->regmap + SCRATCH)); - seq_printf(m, "IER 0x%08x\n", - ioread32(card->regmap + IER)); - seq_printf(m, "IPR 0x%08x\n", - ioread32(card->regmap + IPR)); - seq_printf(m, "CREG_CMD 0x%08x\n", - ioread32(card->regmap + CREG_CMD)); - seq_printf(m, "CREG_ADD 0x%08x\n", - ioread32(card->regmap + CREG_ADD)); - seq_printf(m, "CREG_CNT 0x%08x\n", - ioread32(card->regmap + CREG_CNT)); - seq_printf(m, "CREG_STAT 0x%08x\n", - ioread32(card->regmap + CREG_STAT)); - seq_printf(m, "CREG_DATA0 0x%08x\n", - ioread32(card->regmap + CREG_DATA0)); - seq_printf(m, "CREG_DATA1 0x%08x\n", - ioread32(card->regmap + CREG_DATA1)); - seq_printf(m, "CREG_DATA2 0x%08x\n", - ioread32(card->regmap + CREG_DATA2)); - seq_printf(m, "CREG_DATA3 0x%08x\n", - ioread32(card->regmap + CREG_DATA3)); - seq_printf(m, "CREG_DATA4 0x%08x\n", - ioread32(card->regmap + CREG_DATA4)); - seq_printf(m, "CREG_DATA5 0x%08x\n", - ioread32(card->regmap + CREG_DATA5)); - seq_printf(m, "CREG_DATA6 0x%08x\n", - ioread32(card->regmap + CREG_DATA6)); - seq_printf(m, "CREG_DATA7 0x%08x\n", - ioread32(card->regmap + CREG_DATA7)); - seq_printf(m, "INTR_COAL 0x%08x\n", - ioread32(card->regmap + INTR_COAL)); - seq_printf(m, "HW_ERROR 0x%08x\n", - ioread32(card->regmap + HW_ERROR)); - seq_printf(m, "DEBUG0 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG0)); - seq_printf(m, "DEBUG1 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG1)); - seq_printf(m, "DEBUG2 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG2)); - seq_printf(m, "DEBUG3 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG3)); - seq_printf(m, "DEBUG4 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG4)); - seq_printf(m, "DEBUG5 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG5)); - seq_printf(m, "DEBUG6 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG6)); - seq_printf(m, "DEBUG7 0x%08x\n", - ioread32(card->regmap + PCI_DEBUG7)); - seq_printf(m, "RECONFIG 0x%08x\n", - ioread32(card->regmap + PCI_RECONFIG)); - - return 0; -} - -static int rsxx_attr_stats_show(struct seq_file *m, void *p) -{ - struct rsxx_cardinfo *card = m->private; - int i; - - for (i = 0; i < card->n_targets; i++) { - seq_printf(m, "Ctrl %d CRC Errors = %d\n", - i, card->ctrl[i].stats.crc_errors); - seq_printf(m, "Ctrl %d Hard Errors = %d\n", - i, card->ctrl[i].stats.hard_errors); - seq_printf(m, "Ctrl %d Soft Errors = %d\n", - i, card->ctrl[i].stats.soft_errors); - seq_printf(m, "Ctrl %d Writes Issued = %d\n", - i, card->ctrl[i].stats.writes_issued); - seq_printf(m, "Ctrl %d Writes Failed = %d\n", - i, card->ctrl[i].stats.writes_failed); - seq_printf(m, "Ctrl %d Reads Issued = %d\n", - i, card->ctrl[i].stats.reads_issued); - seq_printf(m, "Ctrl %d Reads Failed = %d\n", - i, card->ctrl[i].stats.reads_failed); - seq_printf(m, "Ctrl %d Reads Retried = %d\n", - i, card->ctrl[i].stats.reads_retried); - seq_printf(m, "Ctrl %d Discards Issued = %d\n", - i, card->ctrl[i].stats.discards_issued); - seq_printf(m, "Ctrl %d Discards Failed = %d\n", - i, card->ctrl[i].stats.discards_failed); - seq_printf(m, "Ctrl %d DMA SW Errors = %d\n", - i, card->ctrl[i].stats.dma_sw_err); - seq_printf(m, "Ctrl %d DMA HW Faults = %d\n", - i, card->ctrl[i].stats.dma_hw_fault); - seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n", - i, 
card->ctrl[i].stats.dma_cancelled); - seq_printf(m, "Ctrl %d SW Queue Depth = %d\n", - i, card->ctrl[i].stats.sw_q_depth); - seq_printf(m, "Ctrl %d HW Queue Depth = %d\n", - i, atomic_read(&card->ctrl[i].stats.hw_q_depth)); - } - - return 0; -} - -static int rsxx_attr_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, rsxx_attr_stats_show, inode->i_private); -} - -static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file) -{ - return single_open(file, rsxx_attr_pci_regs_show, inode->i_private); -} - -static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct rsxx_cardinfo *card = file_inode(fp)->i_private; - char *buf; - int st; - - buf = kzalloc(cnt, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - st = rsxx_creg_read(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1); - if (!st) { - if (copy_to_user(ubuf, buf, cnt)) - st = -EFAULT; - } - kfree(buf); - if (st) - return st; - *ppos += cnt; - return cnt; -} - -static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct rsxx_cardinfo *card = file_inode(fp)->i_private; - char *buf; - ssize_t st; - - buf = memdup_user(ubuf, cnt); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - st = rsxx_creg_write(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1); - kfree(buf); - if (st) - return st; - *ppos += cnt; - return cnt; -} - -static const struct file_operations debugfs_cram_fops = { - .owner = THIS_MODULE, - .read = rsxx_cram_read, - .write = rsxx_cram_write, -}; - -static const struct file_operations debugfs_stats_fops = { - .owner = THIS_MODULE, - .open = rsxx_attr_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations debugfs_pci_regs_fops = { - .owner = THIS_MODULE, - .open = rsxx_attr_pci_regs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card) -{ - struct dentry *debugfs_stats; - struct dentry *debugfs_pci_regs; - struct dentry *debugfs_cram; - - card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL); - if (IS_ERR_OR_NULL(card->debugfs_dir)) - goto failed_debugfs_dir; - - debugfs_stats = debugfs_create_file("stats", 0444, - card->debugfs_dir, card, - &debugfs_stats_fops); - if (IS_ERR_OR_NULL(debugfs_stats)) - goto failed_debugfs_stats; - - debugfs_pci_regs = debugfs_create_file("pci_regs", 0444, - card->debugfs_dir, card, - &debugfs_pci_regs_fops); - if (IS_ERR_OR_NULL(debugfs_pci_regs)) - goto failed_debugfs_pci_regs; - - debugfs_cram = debugfs_create_file("cram", 0644, - card->debugfs_dir, card, - &debugfs_cram_fops); - if (IS_ERR_OR_NULL(debugfs_cram)) - goto failed_debugfs_cram; - - return; -failed_debugfs_cram: - debugfs_remove(debugfs_pci_regs); -failed_debugfs_pci_regs: - debugfs_remove(debugfs_stats); -failed_debugfs_stats: - debugfs_remove(card->debugfs_dir); -failed_debugfs_dir: - card->debugfs_dir = NULL; -} - -/*----------------- Interrupt Control & Handling -------------------*/ - -static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) -{ - card->isr_mask = 0; - card->ier_mask = 0; -} - -static void __enable_intr(unsigned int *mask, unsigned int intr) -{ - *mask |= intr; -} - -static void __disable_intr(unsigned int *mask, unsigned int intr) -{ - *mask &= ~intr; -} - -/* - * NOTE: Disabling the IER will disable the hardware interrupt. - * Disabling the ISR will disable the software handling of the ISR bit. 
- * - * Enable/Disable interrupt functions assume the card->irq_lock - * is held by the caller. - */ -void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr) -{ - if (unlikely(card->halt) || - unlikely(card->eeh_state)) - return; - - __enable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} - -void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr) -{ - if (unlikely(card->eeh_state)) - return; - - __disable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} - -void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr) -{ - if (unlikely(card->halt) || - unlikely(card->eeh_state)) - return; - - __enable_intr(&card->isr_mask, intr); - __enable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} -void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr) -{ - if (unlikely(card->eeh_state)) - return; - - __disable_intr(&card->isr_mask, intr); - __disable_intr(&card->ier_mask, intr); - iowrite32(card->ier_mask, card->regmap + IER); -} - -static irqreturn_t rsxx_isr(int irq, void *pdata) -{ - struct rsxx_cardinfo *card = pdata; - unsigned int isr; - int handled = 0; - int reread_isr; - int i; - - spin_lock(&card->irq_lock); - - do { - reread_isr = 0; - - if (unlikely(card->eeh_state)) - break; - - isr = ioread32(card->regmap + ISR); - if (isr == 0xffffffff) { - /* - * A few systems seem to have an intermittent issue - * where PCI reads return all Fs, but retrying the read - * a little later will return as expected. - */ - dev_info(CARD_TO_DEV(card), - "ISR = 0xFFFFFFFF, retrying later\n"); - break; - } - - isr &= card->isr_mask; - if (!isr) - break; - - for (i = 0; i < card->n_targets; i++) { - if (isr & CR_INTR_DMA(i)) { - if (card->ier_mask & CR_INTR_DMA(i)) { - rsxx_disable_ier(card, CR_INTR_DMA(i)); - reread_isr = 1; - } - queue_work(card->ctrl[i].done_wq, - &card->ctrl[i].dma_done_work); - handled++; - } - } - - if (isr & CR_INTR_CREG) { - queue_work(card->creg_ctrl.creg_wq, - &card->creg_ctrl.done_work); - handled++; - } - - if (isr & CR_INTR_EVENT) { - queue_work(card->event_wq, &card->event_work); - rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); - handled++; - } - } while (reread_isr); - - spin_unlock(&card->irq_lock); - - return handled ? IRQ_HANDLED : IRQ_NONE; -} - -/*----------------- Card Event Handler -------------------*/ -static const char *rsxx_card_state_to_str(unsigned int state) -{ - static const char * const state_strings[] = { - "Unknown", "Shutdown", "Starting", "Formatting", - "Uninitialized", "Good", "Shutting Down", - "Fault", "Read Only Fault", "dStroying" - }; - - return state_strings[ffs(state)]; -} - -static void card_state_change(struct rsxx_cardinfo *card, - unsigned int new_state) -{ - int st; - - dev_info(CARD_TO_DEV(card), - "card state change detected.(%s -> %s)\n", - rsxx_card_state_to_str(card->state), - rsxx_card_state_to_str(new_state)); - - card->state = new_state; - - /* Don't attach DMA interfaces if the card has an invalid config */ - if (!card->config_valid) - return; - - switch (new_state) { - case CARD_STATE_RD_ONLY_FAULT: - dev_crit(CARD_TO_DEV(card), - "Hardware has entered read-only mode!\n"); - /* - * Fall through so the DMA devices can be attached and - * the user can attempt to pull off their data. 
- */ - fallthrough; - case CARD_STATE_GOOD: - st = rsxx_get_card_size8(card, &card->size8); - if (st) - dev_err(CARD_TO_DEV(card), - "Failed attaching DMA devices\n"); - - if (card->config_valid) - set_capacity(card->gendisk, card->size8 >> 9); - break; - - case CARD_STATE_FAULT: - dev_crit(CARD_TO_DEV(card), - "Hardware Fault reported!\n"); - fallthrough; - - /* Everything else, detach DMA interface if it's attached. */ - case CARD_STATE_SHUTDOWN: - case CARD_STATE_STARTING: - case CARD_STATE_FORMATTING: - case CARD_STATE_UNINITIALIZED: - case CARD_STATE_SHUTTING_DOWN: - /* - * dStroy is a term coined by marketing to represent the low level - * secure erase. - */ - case CARD_STATE_DSTROYING: - set_capacity(card->gendisk, 0); - break; - } -} - -static void card_event_handler(struct work_struct *work) -{ - struct rsxx_cardinfo *card; - unsigned int state; - unsigned long flags; - int st; - - card = container_of(work, struct rsxx_cardinfo, event_work); - - if (unlikely(card->halt)) - return; - - /* - * Enable the interrupt now to avoid any weird race conditions where a - * state change might occur while rsxx_get_card_state() is - * processing a returned creg cmd. - */ - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - st = rsxx_get_card_state(card, &state); - if (st) { - dev_info(CARD_TO_DEV(card), - "Failed reading state after event.\n"); - return; - } - - if (card->state != state) - card_state_change(card, state); - - if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING) - rsxx_read_hw_log(card); -} - -/*----------------- Card Operations -------------------*/ -static int card_shutdown(struct rsxx_cardinfo *card) -{ - unsigned int state; - signed long start; - const int timeout = msecs_to_jiffies(120000); - int st; - - /* We can't issue a shutdown if the card is in a transition state */ - start = jiffies; - do { - st = rsxx_get_card_state(card, &state); - if (st) - return st; - } while (state == CARD_STATE_STARTING && - (jiffies - start < timeout)); - - if (state == CARD_STATE_STARTING) - return -ETIMEDOUT; - - /* Only issue a shutdown if we need to */ - if ((state != CARD_STATE_SHUTTING_DOWN) && - (state != CARD_STATE_SHUTDOWN)) { - st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN); - if (st) - return st; - } - - start = jiffies; - do { - st = rsxx_get_card_state(card, &state); - if (st) - return st; - } while (state != CARD_STATE_SHUTDOWN && - (jiffies - start < timeout)); - - if (state != CARD_STATE_SHUTDOWN) - return -ETIMEDOUT; - - return 0; -} - -static int rsxx_eeh_frozen(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - int i; - int st; - - dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n"); - - card->eeh_state = 1; - rsxx_mask_interrupts(card); - - /* - * We need to guarantee that the write for eeh_state and masking - * interrupts does not become reordered. This will prevent a possible - * race condition with the EEH code. 
- */ - wmb(); - - pci_disable_device(dev); - - st = rsxx_eeh_save_issued_dmas(card); - if (st) - return st; - - rsxx_eeh_save_issued_creg(card); - - for (i = 0; i < card->n_targets; i++) { - if (card->ctrl[i].status.buf) - dma_free_coherent(&card->dev->dev, - STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); - if (card->ctrl[i].cmd.buf) - dma_free_coherent(&card->dev->dev, - COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); - } - - return 0; -} - -static void rsxx_eeh_failure(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - int i; - int cnt = 0; - - dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n"); - - card->eeh_state = 1; - card->halt = 1; - - for (i = 0; i < card->n_targets; i++) { - spin_lock_bh(&card->ctrl[i].queue_lock); - cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], - &card->ctrl[i].queue, - COMPLETE_DMA); - spin_unlock_bh(&card->ctrl[i].queue_lock); - - cnt += rsxx_dma_cancel(&card->ctrl[i]); - - if (cnt) - dev_info(CARD_TO_DEV(card), - "Freed %d queued DMAs on channel %d\n", - cnt, card->ctrl[i].id); - } -} - -static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) -{ - unsigned int status; - int iter = 0; - - /* We need to wait for the hardware to reset */ - while (iter++ < 10) { - status = ioread32(card->regmap + PCI_RECONFIG); - - if (status & RSXX_FLUSH_BUSY) { - ssleep(1); - continue; - } - - if (status & RSXX_FLUSH_TIMEOUT) - dev_warn(CARD_TO_DEV(card), "HW: flash controller timeout\n"); - return 0; - } - - /* Hardware failed resetting itself. */ - return -1; -} - -static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev, - pci_channel_state_t error) -{ - int st; - - if (dev->revision < RSXX_EEH_SUPPORT) - return PCI_ERS_RESULT_NONE; - - if (error == pci_channel_io_perm_failure) { - rsxx_eeh_failure(dev); - return PCI_ERS_RESULT_DISCONNECT; - } - - st = rsxx_eeh_frozen(dev); - if (st) { - dev_err(&dev->dev, "Slot reset setup failed\n"); - rsxx_eeh_failure(dev); - return PCI_ERS_RESULT_DISCONNECT; - } - - return PCI_ERS_RESULT_NEED_RESET; -} - -static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - unsigned long flags; - int i; - int st; - - dev_warn(&dev->dev, - "IBM Flash Adapter PCI: recovering from slot reset.\n"); - - st = pci_enable_device(dev); - if (st) - goto failed_hw_setup; - - pci_set_master(dev); - - st = rsxx_eeh_fifo_flush_poll(card); - if (st) - goto failed_hw_setup; - - rsxx_dma_queue_reset(card); - - for (i = 0; i < card->n_targets; i++) { - st = rsxx_hw_buffers_init(dev, &card->ctrl[i]); - if (st) - goto failed_hw_buffers_init; - } - - if (card->config_valid) - rsxx_dma_configure(card); - - /* Clears the ISR register from spurious interrupts */ - st = ioread32(card->regmap + ISR); - - card->eeh_state = 0; - - spin_lock_irqsave(&card->irq_lock, flags); - if (card->n_targets & RSXX_MAX_TARGETS) - rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); - else - rsxx_enable_ier_and_isr(card, CR_INTR_ALL_C); - spin_unlock_irqrestore(&card->irq_lock, flags); - - rsxx_kick_creg_queue(card); - - for (i = 0; i < card->n_targets; i++) { - spin_lock(&card->ctrl[i].queue_lock); - if (list_empty(&card->ctrl[i].queue)) { - spin_unlock(&card->ctrl[i].queue_lock); - continue; - } - spin_unlock(&card->ctrl[i].queue_lock); - - queue_work(card->ctrl[i].issue_wq, - &card->ctrl[i].issue_dma_work); - } - - dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n"); - - return 
PCI_ERS_RESULT_RECOVERED; - -failed_hw_buffers_init: - for (i = 0; i < card->n_targets; i++) { - if (card->ctrl[i].status.buf) - dma_free_coherent(&card->dev->dev, - STATUS_BUFFER_SIZE8, - card->ctrl[i].status.buf, - card->ctrl[i].status.dma_addr); - if (card->ctrl[i].cmd.buf) - dma_free_coherent(&card->dev->dev, - COMMAND_BUFFER_SIZE8, - card->ctrl[i].cmd.buf, - card->ctrl[i].cmd.dma_addr); - } -failed_hw_setup: - rsxx_eeh_failure(dev); - return PCI_ERS_RESULT_DISCONNECT; - -} - -/*----------------- Driver Initialization & Setup -------------------*/ -/* Returns: 0 if the driver is compatible with the device - -1 if the driver is NOT compatible with the device */ -static int rsxx_compatibility_check(struct rsxx_cardinfo *card) -{ - unsigned char pci_rev; - - pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); - - if (pci_rev > RS70_PCI_REV_SUPPORTED) - return -1; - return 0; -} - -static int rsxx_pci_probe(struct pci_dev *dev, - const struct pci_device_id *id) -{ - struct rsxx_cardinfo *card; - int st; - unsigned int sync_timeout; - - dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); - - card = kzalloc(sizeof(*card), GFP_KERNEL); - if (!card) - return -ENOMEM; - - card->dev = dev; - pci_set_drvdata(dev, card); - - st = ida_alloc(&rsxx_disk_ida, GFP_KERNEL); - if (st < 0) - goto failed_ida_get; - card->disk_id = st; - - st = pci_enable_device(dev); - if (st) - goto failed_enable; - - pci_set_master(dev); - - st = dma_set_mask(&dev->dev, DMA_BIT_MASK(64)); - if (st) { - dev_err(CARD_TO_DEV(card), - "No usable DMA configuration,aborting\n"); - goto failed_dma_mask; - } - - st = pci_request_regions(dev, DRIVER_NAME); - if (st) { - dev_err(CARD_TO_DEV(card), - "Failed to request memory region\n"); - goto failed_request_regions; - } - - if (pci_resource_len(dev, 0) == 0) { - dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n"); - st = -ENOMEM; - goto failed_iomap; - } - - card->regmap = pci_iomap(dev, 0, 0); - if (!card->regmap) { - dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n"); - st = -ENOMEM; - goto failed_iomap; - } - - spin_lock_init(&card->irq_lock); - card->halt = 0; - card->eeh_state = 0; - - spin_lock_irq(&card->irq_lock); - rsxx_disable_ier_and_isr(card, CR_INTR_ALL); - spin_unlock_irq(&card->irq_lock); - - if (!force_legacy) { - st = pci_enable_msi(dev); - if (st) - dev_warn(CARD_TO_DEV(card), - "Failed to enable MSI\n"); - } - - st = request_irq(dev->irq, rsxx_isr, IRQF_SHARED, - DRIVER_NAME, card); - if (st) { - dev_err(CARD_TO_DEV(card), - "Failed requesting IRQ%d\n", dev->irq); - goto failed_irq; - } - - /************* Setup Processor Command Interface *************/ - st = rsxx_creg_setup(card); - if (st) { - dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n"); - goto failed_creg_setup; - } - - spin_lock_irq(&card->irq_lock); - rsxx_enable_ier_and_isr(card, CR_INTR_CREG); - spin_unlock_irq(&card->irq_lock); - - st = rsxx_compatibility_check(card); - if (st) { - dev_warn(CARD_TO_DEV(card), - "Incompatible driver detected. 
Please update the driver.\n"); - st = -EINVAL; - goto failed_compatiblity_check; - } - - /************* Load Card Config *************/ - st = rsxx_load_config(card); - if (st) - dev_err(CARD_TO_DEV(card), - "Failed loading card config\n"); - - /************* Setup DMA Engine *************/ - st = rsxx_get_num_targets(card, &card->n_targets); - if (st) - dev_info(CARD_TO_DEV(card), - "Failed reading the number of DMA targets\n"); - - card->ctrl = kcalloc(card->n_targets, sizeof(*card->ctrl), - GFP_KERNEL); - if (!card->ctrl) { - st = -ENOMEM; - goto failed_dma_setup; - } - - st = rsxx_dma_setup(card); - if (st) { - dev_info(CARD_TO_DEV(card), - "Failed to setup DMA engine\n"); - goto failed_dma_setup; - } - - /************* Setup Card Event Handler *************/ - card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event"); - if (!card->event_wq) { - dev_err(CARD_TO_DEV(card), "Failed card event setup.\n"); - st = -ENOMEM; - goto failed_event_handler; - } - - INIT_WORK(&card->event_work, card_event_handler); - - st = rsxx_setup_dev(card); - if (st) - goto failed_create_dev; - - rsxx_get_card_state(card, &card->state); - - dev_info(CARD_TO_DEV(card), - "card state: %s\n", - rsxx_card_state_to_str(card->state)); - - /* - * Now that the DMA Engine and devices have been setup, - * we can enable the event interrupt(it kicks off actions in - * those layers so we couldn't enable it right away.) - */ - spin_lock_irq(&card->irq_lock); - rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); - spin_unlock_irq(&card->irq_lock); - - if (card->state == CARD_STATE_SHUTDOWN) { - st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP); - if (st) - dev_crit(CARD_TO_DEV(card), - "Failed issuing card startup\n"); - if (sync_start) { - sync_timeout = SYNC_START_TIMEOUT; - - dev_info(CARD_TO_DEV(card), - "Waiting for card to startup\n"); - - do { - ssleep(1); - sync_timeout--; - - rsxx_get_card_state(card, &card->state); - } while (sync_timeout && - (card->state == CARD_STATE_STARTING)); - - if (card->state == CARD_STATE_STARTING) { - dev_warn(CARD_TO_DEV(card), - "Card startup timed out\n"); - card->size8 = 0; - } else { - dev_info(CARD_TO_DEV(card), - "card state: %s\n", - rsxx_card_state_to_str(card->state)); - st = rsxx_get_card_size8(card, &card->size8); - if (st) - card->size8 = 0; - } - } - } else if (card->state == CARD_STATE_GOOD || - card->state == CARD_STATE_RD_ONLY_FAULT) { - st = rsxx_get_card_size8(card, &card->size8); - if (st) - card->size8 = 0; - } - - st = rsxx_attach_dev(card); - if (st) - goto failed_create_dev; - - /************* Setup Debugfs *************/ - rsxx_debugfs_dev_new(card); - - return 0; - -failed_create_dev: - destroy_workqueue(card->event_wq); - card->event_wq = NULL; -failed_event_handler: - rsxx_dma_destroy(card); -failed_dma_setup: -failed_compatiblity_check: - destroy_workqueue(card->creg_ctrl.creg_wq); - card->creg_ctrl.creg_wq = NULL; -failed_creg_setup: - spin_lock_irq(&card->irq_lock); - rsxx_disable_ier_and_isr(card, CR_INTR_ALL); - spin_unlock_irq(&card->irq_lock); - free_irq(dev->irq, card); - if (!force_legacy) - pci_disable_msi(dev); -failed_irq: - pci_iounmap(dev, card->regmap); -failed_iomap: - pci_release_regions(dev); -failed_request_regions: -failed_dma_mask: - pci_disable_device(dev); -failed_enable: - ida_free(&rsxx_disk_ida, card->disk_id); -failed_ida_get: - kfree(card); - - return st; -} - -static void rsxx_pci_remove(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - unsigned long flags; - int st; - int i; - - if (!card) - 
return; - - dev_info(CARD_TO_DEV(card), - "Removing PCI-Flash SSD.\n"); - - rsxx_detach_dev(card); - - for (i = 0; i < card->n_targets; i++) { - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); - spin_unlock_irqrestore(&card->irq_lock, flags); - } - - st = card_shutdown(card); - if (st) - dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n"); - - /* Sync outstanding event handlers. */ - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - cancel_work_sync(&card->event_work); - - destroy_workqueue(card->event_wq); - rsxx_destroy_dev(card); - rsxx_dma_destroy(card); - destroy_workqueue(card->creg_ctrl.creg_wq); - - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_ALL); - spin_unlock_irqrestore(&card->irq_lock, flags); - - /* Prevent work_structs from re-queuing themselves. */ - card->halt = 1; - - debugfs_remove_recursive(card->debugfs_dir); - - free_irq(dev->irq, card); - - if (!force_legacy) - pci_disable_msi(dev); - - rsxx_creg_destroy(card); - - pci_iounmap(dev, card->regmap); - - pci_disable_device(dev); - pci_release_regions(dev); - - ida_free(&rsxx_disk_ida, card->disk_id); - kfree(card); -} - -static void rsxx_pci_shutdown(struct pci_dev *dev) -{ - struct rsxx_cardinfo *card = pci_get_drvdata(dev); - unsigned long flags; - int i; - - if (!card) - return; - - dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n"); - - rsxx_detach_dev(card); - - for (i = 0; i < card->n_targets; i++) { - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); - spin_unlock_irqrestore(&card->irq_lock, flags); - } - - card_shutdown(card); -} - -static const struct pci_error_handlers rsxx_err_handler = { - .error_detected = rsxx_error_detected, - .slot_reset = rsxx_slot_reset, -}; - -static const struct pci_device_id rsxx_pci_ids[] = { - {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)}, - {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)}, - {0,}, -}; - -MODULE_DEVICE_TABLE(pci, rsxx_pci_ids); - -static struct pci_driver rsxx_pci_driver = { - .name = DRIVER_NAME, - .id_table = rsxx_pci_ids, - .probe = rsxx_pci_probe, - .remove = rsxx_pci_remove, - .shutdown = rsxx_pci_shutdown, - .err_handler = &rsxx_err_handler, -}; - -static int __init rsxx_core_init(void) -{ - int st; - - st = rsxx_dev_init(); - if (st) - return st; - - st = rsxx_dma_init(); - if (st) - goto dma_init_failed; - - st = rsxx_creg_init(); - if (st) - goto creg_init_failed; - - return pci_register_driver(&rsxx_pci_driver); - -creg_init_failed: - rsxx_dma_cleanup(); -dma_init_failed: - rsxx_dev_cleanup(); - - return st; -} - -static void __exit rsxx_core_cleanup(void) -{ - pci_unregister_driver(&rsxx_pci_driver); - rsxx_creg_cleanup(); - rsxx_dma_cleanup(); - rsxx_dev_cleanup(); -} - -module_init(rsxx_core_init); -module_exit(rsxx_core_cleanup); diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c deleted file mode 100644 index 60ecd3f7cbd2..000000000000 --- a/drivers/block/rsxx/cregs.c +++ /dev/null @@ -1,789 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: cregs.c -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include -#include - -#include "rsxx_priv.h" - -#define CREG_TIMEOUT_MSEC 10000 - -typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card, - struct creg_cmd *cmd, - int st); - -struct creg_cmd { - struct list_head list; - 
creg_cmd_cb cb; - void *cb_private; - unsigned int op; - unsigned int addr; - int cnt8; - void *buf; - unsigned int stream; - unsigned int status; -}; - -static struct kmem_cache *creg_cmd_pool; - - -/*------------ Private Functions --------------*/ - -#if defined(__LITTLE_ENDIAN) -#define LITTLE_ENDIAN 1 -#elif defined(__BIG_ENDIAN) -#define LITTLE_ENDIAN 0 -#else -#error Unknown endianess!!! Aborting... -#endif - -static int copy_to_creg_data(struct rsxx_cardinfo *card, - int cnt8, - void *buf, - unsigned int stream) -{ - int i = 0; - u32 *data = buf; - - if (unlikely(card->eeh_state)) - return -EIO; - - for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { - /* - * Firmware implementation makes it necessary to byte swap on - * little endian processors. - */ - if (LITTLE_ENDIAN && stream) - iowrite32be(data[i], card->regmap + CREG_DATA(i)); - else - iowrite32(data[i], card->regmap + CREG_DATA(i)); - } - - return 0; -} - - -static int copy_from_creg_data(struct rsxx_cardinfo *card, - int cnt8, - void *buf, - unsigned int stream) -{ - int i = 0; - u32 *data = buf; - - if (unlikely(card->eeh_state)) - return -EIO; - - for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { - /* - * Firmware implementation makes it necessary to byte swap on - * little endian processors. - */ - if (LITTLE_ENDIAN && stream) - data[i] = ioread32be(card->regmap + CREG_DATA(i)); - else - data[i] = ioread32(card->regmap + CREG_DATA(i)); - } - - return 0; -} - -static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) -{ - int st; - - if (unlikely(card->eeh_state)) - return; - - iowrite32(cmd->addr, card->regmap + CREG_ADD); - iowrite32(cmd->cnt8, card->regmap + CREG_CNT); - - if (cmd->op == CREG_OP_WRITE) { - if (cmd->buf) { - st = copy_to_creg_data(card, cmd->cnt8, - cmd->buf, cmd->stream); - if (st) - return; - } - } - - if (unlikely(card->eeh_state)) - return; - - /* Setting the valid bit will kick off the command. */ - iowrite32(cmd->op, card->regmap + CREG_CMD); -} - -static void creg_kick_queue(struct rsxx_cardinfo *card) -{ - if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue)) - return; - - card->creg_ctrl.active = 1; - card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue, - struct creg_cmd, list); - list_del(&card->creg_ctrl.active_cmd->list); - card->creg_ctrl.q_depth--; - - /* - * We have to set the timer before we push the new command. Otherwise, - * we could create a race condition that would occur if the timer - * was not canceled, and expired after the new command was pushed, - * but before the command was issued to hardware. - */ - mod_timer(&card->creg_ctrl.cmd_timer, - jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC)); - - creg_issue_cmd(card, card->creg_ctrl.active_cmd); -} - -static int creg_queue_cmd(struct rsxx_cardinfo *card, - unsigned int op, - unsigned int addr, - unsigned int cnt8, - void *buf, - int stream, - creg_cmd_cb callback, - void *cb_private) -{ - struct creg_cmd *cmd; - - /* Don't queue stuff up if we're halted. 
*/ - if (unlikely(card->halt)) - return -EINVAL; - - if (card->creg_ctrl.reset) - return -EAGAIN; - - if (cnt8 > MAX_CREG_DATA8) - return -EINVAL; - - cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL); - if (!cmd) - return -ENOMEM; - - INIT_LIST_HEAD(&cmd->list); - - cmd->op = op; - cmd->addr = addr; - cmd->cnt8 = cnt8; - cmd->buf = buf; - cmd->stream = stream; - cmd->cb = callback; - cmd->cb_private = cb_private; - cmd->status = 0; - - spin_lock_bh(&card->creg_ctrl.lock); - list_add_tail(&cmd->list, &card->creg_ctrl.queue); - card->creg_ctrl.q_depth++; - creg_kick_queue(card); - spin_unlock_bh(&card->creg_ctrl.lock); - - return 0; -} - -static void creg_cmd_timed_out(struct timer_list *t) -{ - struct rsxx_cardinfo *card = from_timer(card, t, creg_ctrl.cmd_timer); - struct creg_cmd *cmd; - - spin_lock(&card->creg_ctrl.lock); - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - spin_unlock(&card->creg_ctrl.lock); - - if (cmd == NULL) { - card->creg_ctrl.creg_stats.creg_timeout++; - dev_warn(CARD_TO_DEV(card), - "No active command associated with timeout!\n"); - return; - } - - if (cmd->cb) - cmd->cb(card, cmd, -ETIMEDOUT); - - kmem_cache_free(creg_cmd_pool, cmd); - - - spin_lock(&card->creg_ctrl.lock); - card->creg_ctrl.active = 0; - creg_kick_queue(card); - spin_unlock(&card->creg_ctrl.lock); -} - - -static void creg_cmd_done(struct work_struct *work) -{ - struct rsxx_cardinfo *card; - struct creg_cmd *cmd; - int st = 0; - - card = container_of(work, struct rsxx_cardinfo, - creg_ctrl.done_work); - - /* - * The timer could not be cancelled for some reason, - * race to pop the active command. - */ - if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0) - card->creg_ctrl.creg_stats.failed_cancel_timer++; - - spin_lock_bh(&card->creg_ctrl.lock); - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - spin_unlock_bh(&card->creg_ctrl.lock); - - if (cmd == NULL) { - dev_err(CARD_TO_DEV(card), - "Spurious creg interrupt!\n"); - return; - } - - card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT); - cmd->status = card->creg_ctrl.creg_stats.stat; - if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) { - dev_err(CARD_TO_DEV(card), - "Invalid status on creg command\n"); - /* - * At this point we're probably reading garbage from HW. Don't - * do anything else that could mess up the system and let - * the sync function return an error. - */ - st = -EIO; - goto creg_done; - } else if (cmd->status & CREG_STAT_ERROR) { - st = -EIO; - } - - if (cmd->op == CREG_OP_READ) { - unsigned int cnt8 = ioread32(card->regmap + CREG_CNT); - - /* Paranoid Sanity Checks */ - if (!cmd->buf) { - dev_err(CARD_TO_DEV(card), - "Buffer not given for read.\n"); - st = -EIO; - goto creg_done; - } - if (cnt8 != cmd->cnt8) { - dev_err(CARD_TO_DEV(card), - "count mismatch\n"); - st = -EIO; - goto creg_done; - } - - st = copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream); - } - -creg_done: - if (cmd->cb) - cmd->cb(card, cmd, st); - - kmem_cache_free(creg_cmd_pool, cmd); - - spin_lock_bh(&card->creg_ctrl.lock); - card->creg_ctrl.active = 0; - creg_kick_queue(card); - spin_unlock_bh(&card->creg_ctrl.lock); -} - -static void creg_reset(struct rsxx_cardinfo *card) -{ - struct creg_cmd *cmd = NULL; - struct creg_cmd *tmp; - unsigned long flags; - - /* - * mutex_trylock is used here because if reset_lock is taken then a - * reset is already happening. So, we can just go ahead and return. 
- */ - if (!mutex_trylock(&card->creg_ctrl.reset_lock)) - return; - - card->creg_ctrl.reset = 1; - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - dev_warn(CARD_TO_DEV(card), - "Resetting creg interface for recovery\n"); - - /* Cancel outstanding commands */ - spin_lock_bh(&card->creg_ctrl.lock); - list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { - list_del(&cmd->list); - card->creg_ctrl.q_depth--; - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - kmem_cache_free(creg_cmd_pool, cmd); - } - - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - if (cmd) { - if (timer_pending(&card->creg_ctrl.cmd_timer)) - del_timer_sync(&card->creg_ctrl.cmd_timer); - - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - kmem_cache_free(creg_cmd_pool, cmd); - - card->creg_ctrl.active = 0; - } - spin_unlock_bh(&card->creg_ctrl.lock); - - card->creg_ctrl.reset = 0; - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); - spin_unlock_irqrestore(&card->irq_lock, flags); - - mutex_unlock(&card->creg_ctrl.reset_lock); -} - -/* Used for synchronous accesses */ -struct creg_completion { - struct completion *cmd_done; - int st; - u32 creg_status; -}; - -static void creg_cmd_done_cb(struct rsxx_cardinfo *card, - struct creg_cmd *cmd, - int st) -{ - struct creg_completion *cmd_completion; - - cmd_completion = cmd->cb_private; - BUG_ON(!cmd_completion); - - cmd_completion->st = st; - cmd_completion->creg_status = cmd->status; - complete(cmd_completion->cmd_done); -} - -static int __issue_creg_rw(struct rsxx_cardinfo *card, - unsigned int op, - unsigned int addr, - unsigned int cnt8, - void *buf, - int stream, - unsigned int *hw_stat) -{ - DECLARE_COMPLETION_ONSTACK(cmd_done); - struct creg_completion completion; - unsigned long timeout; - int st; - - completion.cmd_done = &cmd_done; - completion.st = 0; - completion.creg_status = 0; - - st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb, - &completion); - if (st) - return st; - - /* - * This timeout is necessary for unresponsive hardware. The additional - * 20 seconds to used to guarantee that each cregs requests has time to - * complete. - */ - timeout = msecs_to_jiffies(CREG_TIMEOUT_MSEC * - card->creg_ctrl.q_depth + 20000); - - /* - * The creg interface is guaranteed to complete. It has a timeout - * mechanism that will kick in if hardware does not respond. - */ - st = wait_for_completion_timeout(completion.cmd_done, timeout); - if (st == 0) { - /* - * This is really bad, because the kernel timer did not - * expire and notify us of a timeout! - */ - dev_crit(CARD_TO_DEV(card), - "cregs timer failed\n"); - creg_reset(card); - return -EIO; - } - - *hw_stat = completion.creg_status; - - if (completion.st) { - /* - * This read is needed to verify that there has not been any - * extreme errors that might have occurred, i.e. EEH. The - * function iowrite32 will not detect EEH errors, so it is - * necessary that we recover if such an error is the reason - * for the timeout. This is a dummy read. 
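The dummy read described above works because an EEH-isolated slot returns all 1s on MMIO reads while writes are silently dropped, so a throwaway read is the cheapest way to let the PCI core notice the failure. A minimal sketch of that probe (illustrative only; the driver itself does not check the value, it relies on the EEH machinery reacting to the read):

	/* Illustrative probe, not driver code. */
	if (ioread32(card->regmap + SCRATCH) == 0xffffffff)
		dev_warn(CARD_TO_DEV(card), "possible EEH isolation\n");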
- */ - ioread32(card->regmap + SCRATCH); - - dev_warn(CARD_TO_DEV(card), - "creg command failed(%d x%08x)\n", - completion.st, addr); - return completion.st; - } - - return 0; -} - -static int issue_creg_rw(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int stream, - int read) -{ - unsigned int hw_stat; - unsigned int xfer; - unsigned int op; - int st; - - op = read ? CREG_OP_READ : CREG_OP_WRITE; - - do { - xfer = min_t(unsigned int, size8, MAX_CREG_DATA8); - - st = __issue_creg_rw(card, op, addr, xfer, - data, stream, &hw_stat); - if (st) - return st; - - data = (char *)data + xfer; - addr += xfer; - size8 -= xfer; - } while (size8); - - return 0; -} - -/* ---------------------------- Public API ---------------------------------- */ -int rsxx_creg_write(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int byte_stream) -{ - return issue_creg_rw(card, addr, size8, data, byte_stream, 0); -} - -int rsxx_creg_read(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int byte_stream) -{ - return issue_creg_rw(card, addr, size8, data, byte_stream, 1); -} - -int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state) -{ - return rsxx_creg_read(card, CREG_ADD_CARD_STATE, - sizeof(*state), state, 0); -} - -int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8) -{ - unsigned int size; - int st; - - st = rsxx_creg_read(card, CREG_ADD_CARD_SIZE, - sizeof(size), &size, 0); - if (st) - return st; - - *size8 = (u64)size * RSXX_HW_BLK_SIZE; - return 0; -} - -int rsxx_get_num_targets(struct rsxx_cardinfo *card, - unsigned int *n_targets) -{ - return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS, - sizeof(*n_targets), n_targets, 0); -} - -int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, - u32 *capabilities) -{ - return rsxx_creg_read(card, CREG_ADD_CAPABILITIES, - sizeof(*capabilities), capabilities, 0); -} - -int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd) -{ - return rsxx_creg_write(card, CREG_ADD_CARD_CMD, - sizeof(cmd), &cmd, 0); -} - - -/*----------------- HW Log Functions -------------------*/ -static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len) -{ - static char level; - - /* - * New messages start with "<#>", where # is the log level. Messages - * that extend past the log buffer will use the previous level - */ - if ((len > 3) && (str[0] == '<') && (str[2] == '>')) { - level = str[1]; - str += 3; /* Skip past the log level. */ - len -= 3; - } - - switch (level) { - case '0': - dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '1': - dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '2': - dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '3': - dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '4': - dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '5': - dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '6': - dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - case '7': - dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - default: - dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); - break; - } -} - -/* - * The substrncpy function copies the src string (which includes the - * terminating '\0' character), up to the count into the dest pointer. - * Returns the number of bytes copied to dest. 
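A usage sketch for the helper documented here (buffer and input are hypothetical):

	char dest[8];
	/*
	 * Copies through the terminator: n == 3 ('o', 'k', '\0').
	 * Note dest is NOT NUL-terminated if no '\0' occurs within
	 * the first count bytes of src.
	 */
	int n = substrncpy(dest, "ok", 8);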
- */ -static int substrncpy(char *dest, const char *src, int count) -{ - int max_cnt = count; - - while (count) { - count--; - *dest = *src; - if (*dest == '\0') - break; - src++; - dest++; - } - return max_cnt - count; -} - - -static void read_hw_log_done(struct rsxx_cardinfo *card, - struct creg_cmd *cmd, - int st) -{ - char *buf; - char *log_str; - int cnt; - int len; - int off; - - buf = cmd->buf; - off = 0; - - /* Failed getting the log message */ - if (st) - return; - - while (off < cmd->cnt8) { - log_str = &card->log.buf[card->log.buf_len]; - cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len); - len = substrncpy(log_str, &buf[off], cnt); - - off += len; - card->log.buf_len += len; - - /* - * Flush the log if we've hit the end of a message or if we've - * run out of buffer space. - */ - if ((log_str[len - 1] == '\0') || - (card->log.buf_len == LOG_BUF_SIZE8)) { - if (card->log.buf_len != 1) /* Don't log blank lines. */ - hw_log_msg(card, card->log.buf, - card->log.buf_len); - card->log.buf_len = 0; - } - - } - - if (cmd->status & CREG_STAT_LOG_PENDING) - rsxx_read_hw_log(card); -} - -int rsxx_read_hw_log(struct rsxx_cardinfo *card) -{ - int st; - - st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG, - sizeof(card->log.tmp), card->log.tmp, - 1, read_hw_log_done, NULL); - if (st) - dev_err(CARD_TO_DEV(card), - "Failed getting log text\n"); - - return st; -} - -/*-------------- IOCTL REG Access ------------------*/ -static int issue_reg_cmd(struct rsxx_cardinfo *card, - struct rsxx_reg_access *cmd, - int read) -{ - unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE; - - return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data, - cmd->stream, &cmd->stat); -} - -int rsxx_reg_access(struct rsxx_cardinfo *card, - struct rsxx_reg_access __user *ucmd, - int read) -{ - struct rsxx_reg_access cmd; - int st; - - st = copy_from_user(&cmd, ucmd, sizeof(cmd)); - if (st) - return -EFAULT; - - if (cmd.cnt > RSXX_MAX_REG_CNT) - return -EFAULT; - - st = issue_reg_cmd(card, &cmd, read); - if (st) - return st; - - st = put_user(cmd.stat, &ucmd->stat); - if (st) - return -EFAULT; - - if (read) { - st = copy_to_user(ucmd->data, cmd.data, cmd.cnt); - if (st) - return -EFAULT; - } - - return 0; -} - -void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card) -{ - struct creg_cmd *cmd = NULL; - - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - - if (cmd) { - del_timer_sync(&card->creg_ctrl.cmd_timer); - - spin_lock_bh(&card->creg_ctrl.lock); - list_add(&cmd->list, &card->creg_ctrl.queue); - card->creg_ctrl.q_depth++; - card->creg_ctrl.active = 0; - spin_unlock_bh(&card->creg_ctrl.lock); - } -} - -void rsxx_kick_creg_queue(struct rsxx_cardinfo *card) -{ - spin_lock_bh(&card->creg_ctrl.lock); - if (!list_empty(&card->creg_ctrl.queue)) - creg_kick_queue(card); - spin_unlock_bh(&card->creg_ctrl.lock); -} - -/*------------ Initialization & Setup --------------*/ -int rsxx_creg_setup(struct rsxx_cardinfo *card) -{ - card->creg_ctrl.active_cmd = NULL; - - card->creg_ctrl.creg_wq = - create_singlethread_workqueue(DRIVER_NAME"_creg"); - if (!card->creg_ctrl.creg_wq) - return -ENOMEM; - - INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); - mutex_init(&card->creg_ctrl.reset_lock); - INIT_LIST_HEAD(&card->creg_ctrl.queue); - spin_lock_init(&card->creg_ctrl.lock); - timer_setup(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out, 0); - - return 0; -} - -void rsxx_creg_destroy(struct rsxx_cardinfo *card) -{ - struct creg_cmd *cmd; - struct creg_cmd *tmp; - int cnt = 0; - - /* 
Cancel outstanding commands */ - spin_lock_bh(&card->creg_ctrl.lock); - list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { - list_del(&cmd->list); - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - kmem_cache_free(creg_cmd_pool, cmd); - cnt++; - } - - if (cnt) - dev_info(CARD_TO_DEV(card), - "Canceled %d queue creg commands\n", cnt); - - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - if (cmd) { - if (timer_pending(&card->creg_ctrl.cmd_timer)) - del_timer_sync(&card->creg_ctrl.cmd_timer); - - if (cmd->cb) - cmd->cb(card, cmd, -ECANCELED); - dev_info(CARD_TO_DEV(card), - "Canceled active creg command\n"); - kmem_cache_free(creg_cmd_pool, cmd); - } - spin_unlock_bh(&card->creg_ctrl.lock); - - cancel_work_sync(&card->creg_ctrl.done_work); -} - - -int rsxx_creg_init(void) -{ - creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN); - if (!creg_cmd_pool) - return -ENOMEM; - - return 0; -} - -void rsxx_creg_cleanup(void) -{ - kmem_cache_destroy(creg_cmd_pool); -} diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c deleted file mode 100644 index dd33f1bdf3b8..000000000000 --- a/drivers/block/rsxx/dev.c +++ /dev/null @@ -1,306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: dev.c -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include "rsxx_priv.h" - -static unsigned int blkdev_minors = 64; -module_param(blkdev_minors, uint, 0444); -MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)"); - -/* - * For now I'm making this tweakable in case any applications hit this limit. - * If you see a "bio too big" error in the log you will need to raise this - * value. - */ -static unsigned int blkdev_max_hw_sectors = 1024; -module_param(blkdev_max_hw_sectors, uint, 0444); -MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO"); - -static unsigned int enable_blkdev = 1; -module_param(enable_blkdev , uint, 0444); -MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces"); - - -struct rsxx_bio_meta { - struct bio *bio; - atomic_t pending_dmas; - atomic_t error; - unsigned long start_time; -}; - -static struct kmem_cache *bio_meta_pool; - -static void rsxx_submit_bio(struct bio *bio); - -/*----------------- Block Device Operations -----------------*/ -static int rsxx_blkdev_ioctl(struct block_device *bdev, - fmode_t mode, - unsigned int cmd, - unsigned long arg) -{ - struct rsxx_cardinfo *card = bdev->bd_disk->private_data; - - switch (cmd) { - case RSXX_GETREG: - return rsxx_reg_access(card, (void __user *)arg, 1); - case RSXX_SETREG: - return rsxx_reg_access(card, (void __user *)arg, 0); - } - - return -ENOTTY; -} - -static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct rsxx_cardinfo *card = bdev->bd_disk->private_data; - u64 blocks = card->size8 >> 9; - - /* - * get geometry: Fake it. I haven't found any drivers that set - * geo->start, so we won't either. 
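A worked instance of the fake CHS mapping implemented just below (capacity is illustrative):

	/*
	 * heads = 64, sectors = 16 => 64 * 16 = 1024 sectors/cylinder.
	 * An 800 GiB card has ((800ULL << 30) >> 9) 512-byte sectors,
	 * so it reports ((800ULL << 30) >> 9) / 1024 = 1,638,400
	 * cylinders.
	 */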
- */ - if (card->size8) { - geo->heads = 64; - geo->sectors = 16; - do_div(blocks, (geo->heads * geo->sectors)); - geo->cylinders = blocks; - } else { - geo->heads = 0; - geo->sectors = 0; - geo->cylinders = 0; - } - return 0; -} - -static const struct block_device_operations rsxx_fops = { - .owner = THIS_MODULE, - .submit_bio = rsxx_submit_bio, - .getgeo = rsxx_getgeo, - .ioctl = rsxx_blkdev_ioctl, -}; - -static void bio_dma_done_cb(struct rsxx_cardinfo *card, - void *cb_data, - unsigned int error) -{ - struct rsxx_bio_meta *meta = cb_data; - - if (error) - atomic_set(&meta->error, 1); - - if (atomic_dec_and_test(&meta->pending_dmas)) { - if (!card->eeh_state && card->gendisk) - bio_end_io_acct(meta->bio, meta->start_time); - - if (atomic_read(&meta->error)) - bio_io_error(meta->bio); - else - bio_endio(meta->bio); - kmem_cache_free(bio_meta_pool, meta); - } -} - -static void rsxx_submit_bio(struct bio *bio) -{ - struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data; - struct rsxx_bio_meta *bio_meta; - blk_status_t st = BLK_STS_IOERR; - - blk_queue_split(&bio); - - might_sleep(); - - if (!card) - goto req_err; - - if (bio_end_sector(bio) > get_capacity(card->gendisk)) - goto req_err; - - if (unlikely(card->halt)) - goto req_err; - - if (unlikely(card->dma_fault)) - goto req_err; - - if (bio->bi_iter.bi_size == 0) { - dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); - goto req_err; - } - - bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); - if (!bio_meta) { - st = BLK_STS_RESOURCE; - goto req_err; - } - - bio_meta->bio = bio; - atomic_set(&bio_meta->error, 0); - atomic_set(&bio_meta->pending_dmas, 0); - - if (!unlikely(card->halt)) - bio_meta->start_time = bio_start_io_acct(bio); - - dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", - bio_data_dir(bio) ? 'W' : 'R', bio_meta, - (u64)bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size); - - st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas, - bio_dma_done_cb, bio_meta); - if (st) - goto queue_err; - - return; - -queue_err: - kmem_cache_free(bio_meta_pool, bio_meta); -req_err: - if (st) - bio->bi_status = st; - bio_endio(bio); -} - -/*----------------- Device Setup -------------------*/ -static bool rsxx_discard_supported(struct rsxx_cardinfo *card) -{ - unsigned char pci_rev; - - pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); - - return (pci_rev >= RSXX_DISCARD_SUPPORT); -} - -int rsxx_attach_dev(struct rsxx_cardinfo *card) -{ - int err = 0; - - mutex_lock(&card->dev_lock); - - /* The block device requires the stripe size from the config. 
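One unit note for the attach path below: card->size8 counts bytes while set_capacity() takes 512-byte sectors, hence the >> 9. For example (size is hypothetical):

	u64 size8 = 1ULL << 40;		/* a 1 TiB card */
	sector_t sectors = size8 >> 9;	/* 2^31 512-byte sectors */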
*/ - if (enable_blkdev) { - if (card->config_valid) - set_capacity(card->gendisk, card->size8 >> 9); - else - set_capacity(card->gendisk, 0); - err = device_add_disk(CARD_TO_DEV(card), card->gendisk, NULL); - if (err == 0) - card->bdev_attached = 1; - } - - mutex_unlock(&card->dev_lock); - - if (err) - blk_cleanup_disk(card->gendisk); - - return err; -} - -void rsxx_detach_dev(struct rsxx_cardinfo *card) -{ - mutex_lock(&card->dev_lock); - - if (card->bdev_attached) { - del_gendisk(card->gendisk); - card->bdev_attached = 0; - } - - mutex_unlock(&card->dev_lock); -} - -int rsxx_setup_dev(struct rsxx_cardinfo *card) -{ - unsigned short blk_size; - - mutex_init(&card->dev_lock); - - if (!enable_blkdev) - return 0; - - card->major = register_blkdev(0, DRIVER_NAME); - if (card->major < 0) { - dev_err(CARD_TO_DEV(card), "Failed to get major number\n"); - return -ENOMEM; - } - - card->gendisk = blk_alloc_disk(blkdev_minors); - if (!card->gendisk) { - dev_err(CARD_TO_DEV(card), "Failed disk alloc\n"); - unregister_blkdev(card->major, DRIVER_NAME); - return -ENOMEM; - } - - if (card->config_valid) { - blk_size = card->config.data.block_size; - blk_queue_dma_alignment(card->gendisk->queue, blk_size - 1); - blk_queue_logical_block_size(card->gendisk->queue, blk_size); - } - - blk_queue_max_hw_sectors(card->gendisk->queue, blkdev_max_hw_sectors); - blk_queue_physical_block_size(card->gendisk->queue, RSXX_HW_BLK_SIZE); - - blk_queue_flag_set(QUEUE_FLAG_NONROT, card->gendisk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, card->gendisk->queue); - if (rsxx_discard_supported(card)) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, card->gendisk->queue); - blk_queue_max_discard_sectors(card->gendisk->queue, - RSXX_HW_BLK_SIZE >> 9); - card->gendisk->queue->limits.discard_granularity = - RSXX_HW_BLK_SIZE; - card->gendisk->queue->limits.discard_alignment = - RSXX_HW_BLK_SIZE; - } - - snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), - "rsxx%d", card->disk_id); - card->gendisk->major = card->major; - card->gendisk->minors = blkdev_minors; - card->gendisk->fops = &rsxx_fops; - card->gendisk->private_data = card; - - return 0; -} - -void rsxx_destroy_dev(struct rsxx_cardinfo *card) -{ - if (!enable_blkdev) - return; - - blk_cleanup_disk(card->gendisk); - card->gendisk = NULL; - unregister_blkdev(card->major, DRIVER_NAME); -} - -int rsxx_dev_init(void) -{ - bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN); - if (!bio_meta_pool) - return -ENOMEM; - - return 0; -} - -void rsxx_dev_cleanup(void) -{ - kmem_cache_destroy(bio_meta_pool); -} - - diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c deleted file mode 100644 index ed182f3dd054..000000000000 --- a/drivers/block/rsxx/dma.c +++ /dev/null @@ -1,1085 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* -* Filename: dma.c -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#include -#include "rsxx_priv.h" - -struct rsxx_dma { - struct list_head list; - u8 cmd; - unsigned int laddr; /* Logical address */ - struct { - u32 off; - u32 cnt; - } sub_page; - dma_addr_t dma_addr; - struct page *page; - unsigned int pg_off; /* Page Offset */ - rsxx_dma_cb cb; - void *cb_data; -}; - -/* This timeout is used to detect a stalled DMA channel */ -#define DMA_ACTIVITY_TIMEOUT msecs_to_jiffies(10000) - -struct hw_status { - u8 status; - u8 tag; - __le16 count; - __le32 _rsvd2; - __le64 _rsvd3; -} __packed; - -enum rsxx_dma_status { - DMA_SW_ERR = 0x1, - DMA_HW_FAULT = 0x2, - 
DMA_CANCELLED = 0x4, -}; - -struct hw_cmd { - u8 command; - u8 tag; - u8 _rsvd; - u8 sub_page; /* Bit[0:2]: 512byte offset */ - /* Bit[4:6]: 512byte count */ - __le32 device_addr; - __le64 host_addr; -} __packed; - -enum rsxx_hw_cmd { - HW_CMD_BLK_DISCARD = 0x70, - HW_CMD_BLK_WRITE = 0x80, - HW_CMD_BLK_READ = 0xC0, - HW_CMD_BLK_RECON_READ = 0xE0, -}; - -enum rsxx_hw_status { - HW_STATUS_CRC = 0x01, - HW_STATUS_HARD_ERR = 0x02, - HW_STATUS_SOFT_ERR = 0x04, - HW_STATUS_FAULT = 0x08, -}; - -static struct kmem_cache *rsxx_dma_pool; - -struct dma_tracker { - int next_tag; - struct rsxx_dma *dma; -}; - -struct dma_tracker_list { - spinlock_t lock; - int head; - struct dma_tracker list[]; -}; - - -/*----------------- Misc Utility Functions -------------------*/ -static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card) -{ - unsigned long long tgt_addr8; - - tgt_addr8 = ((addr8 >> card->_stripe.upper_shift) & - card->_stripe.upper_mask) | - ((addr8) & card->_stripe.lower_mask); - do_div(tgt_addr8, RSXX_HW_BLK_SIZE); - return tgt_addr8; -} - -static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8) -{ - unsigned int tgt; - - tgt = (addr8 >> card->_stripe.target_shift) & card->_stripe.target_mask; - - return tgt; -} - -void rsxx_dma_queue_reset(struct rsxx_cardinfo *card) -{ - /* Reset all DMA Command/Status Queues */ - iowrite32(DMA_QUEUE_RESET, card->regmap + RESET); -} - -static unsigned int get_dma_size(struct rsxx_dma *dma) -{ - if (dma->sub_page.cnt) - return dma->sub_page.cnt << 9; - else - return RSXX_HW_BLK_SIZE; -} - - -/*----------------- DMA Tracker -------------------*/ -static void set_tracker_dma(struct dma_tracker_list *trackers, - int tag, - struct rsxx_dma *dma) -{ - trackers->list[tag].dma = dma; -} - -static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers, - int tag) -{ - return trackers->list[tag].dma; -} - -static int pop_tracker(struct dma_tracker_list *trackers) -{ - int tag; - - spin_lock(&trackers->lock); - tag = trackers->head; - if (tag != -1) { - trackers->head = trackers->list[tag].next_tag; - trackers->list[tag].next_tag = -1; - } - spin_unlock(&trackers->lock); - - return tag; -} - -static void push_tracker(struct dma_tracker_list *trackers, int tag) -{ - spin_lock(&trackers->lock); - trackers->list[tag].next_tag = trackers->head; - trackers->head = tag; - trackers->list[tag].dma = NULL; - spin_unlock(&trackers->lock); -} - - -/*----------------- Interrupt Coalescing -------------*/ -/* - * Interrupt Coalescing Register Format: - * Interrupt Timer (64ns units) [15:0] - * Interrupt Count [24:16] - * Reserved [31:25] -*/ -#define INTR_COAL_LATENCY_MASK (0x0000ffff) - -#define INTR_COAL_COUNT_SHIFT 16 -#define INTR_COAL_COUNT_BITS 9 -#define INTR_COAL_COUNT_MASK (((1 << INTR_COAL_COUNT_BITS) - 1) << \ - INTR_COAL_COUNT_SHIFT) -#define INTR_COAL_LATENCY_UNITS_NS 64 - - -static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency) -{ - u32 latency_units = latency / INTR_COAL_LATENCY_UNITS_NS; - - if (mode == RSXX_INTR_COAL_DISABLED) - return 0; - - return ((count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK) | - (latency_units & INTR_COAL_LATENCY_MASK); - -} - -static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) -{ - int i; - u32 q_depth = 0; - u32 intr_coal; - - if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE || - unlikely(card->eeh_state)) - return; - - for (i = 0; i < card->n_targets; i++) - q_depth += atomic_read(&card->ctrl[i].stats.hw_q_depth); - - intr_coal = 
dma_intr_coal_val(card->config.data.intr_coal.mode, - q_depth / 2, - card->config.data.intr_coal.latency); - iowrite32(intr_coal, card->regmap + INTR_COAL); -} - -/*----------------- RSXX DMA Handling -------------------*/ -static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma) -{ - if (dma->cmd != HW_CMD_BLK_DISCARD) { - if (!dma_mapping_error(&ctrl->card->dev->dev, dma->dma_addr)) { - dma_unmap_page(&ctrl->card->dev->dev, dma->dma_addr, - get_dma_size(dma), - dma->cmd == HW_CMD_BLK_WRITE ? - DMA_TO_DEVICE : - DMA_FROM_DEVICE); - } - } - - kmem_cache_free(rsxx_dma_pool, dma); -} - -static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, - struct rsxx_dma *dma, - unsigned int status) -{ - if (status & DMA_SW_ERR) - ctrl->stats.dma_sw_err++; - if (status & DMA_HW_FAULT) - ctrl->stats.dma_hw_fault++; - if (status & DMA_CANCELLED) - ctrl->stats.dma_cancelled++; - - if (dma->cb) - dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); - - rsxx_free_dma(ctrl, dma); -} - -int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, - struct list_head *q, unsigned int done) -{ - struct rsxx_dma *dma; - struct rsxx_dma *tmp; - int cnt = 0; - - list_for_each_entry_safe(dma, tmp, q, list) { - list_del(&dma->list); - if (done & COMPLETE_DMA) - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - else - rsxx_free_dma(ctrl, dma); - cnt++; - } - - return cnt; -} - -static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, - struct rsxx_dma *dma) -{ - /* - * Requeued DMAs go to the front of the queue so they are issued - * first. - */ - spin_lock_bh(&ctrl->queue_lock); - ctrl->stats.sw_q_depth++; - list_add(&dma->list, &ctrl->queue); - spin_unlock_bh(&ctrl->queue_lock); -} - -static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, - struct rsxx_dma *dma, - u8 hw_st) -{ - unsigned int status = 0; - int requeue_cmd = 0; - - dev_dbg(CARD_TO_DEV(ctrl->card), - "Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n", - dma->cmd, dma->laddr, hw_st); - - if (hw_st & HW_STATUS_CRC) - ctrl->stats.crc_errors++; - if (hw_st & HW_STATUS_HARD_ERR) - ctrl->stats.hard_errors++; - if (hw_st & HW_STATUS_SOFT_ERR) - ctrl->stats.soft_errors++; - - switch (dma->cmd) { - case HW_CMD_BLK_READ: - if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { - if (ctrl->card->scrub_hard) { - dma->cmd = HW_CMD_BLK_RECON_READ; - requeue_cmd = 1; - ctrl->stats.reads_retried++; - } else { - status |= DMA_HW_FAULT; - ctrl->stats.reads_failed++; - } - } else if (hw_st & HW_STATUS_FAULT) { - status |= DMA_HW_FAULT; - ctrl->stats.reads_failed++; - } - - break; - case HW_CMD_BLK_RECON_READ: - if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { - /* Data could not be reconstructed. 
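For concreteness, one register value produced by dma_intr_coal_val() above (inputs are illustrative, not configuration defaults):

	/*
	 * count = 8 completions, latency = 1600 ns:
	 *	latency_units = 1600 / 64 = 25
	 *	value = ((8 << 16) & INTR_COAL_COUNT_MASK)
	 *	      | (25 & INTR_COAL_LATENCY_MASK) = 0x00080019
	 * (RSXX_INTR_COAL_DISABLED mode yields 0 instead.)
	 */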
*/ - status |= DMA_HW_FAULT; - ctrl->stats.reads_failed++; - } - - break; - case HW_CMD_BLK_WRITE: - status |= DMA_HW_FAULT; - ctrl->stats.writes_failed++; - - break; - case HW_CMD_BLK_DISCARD: - status |= DMA_HW_FAULT; - ctrl->stats.discards_failed++; - - break; - default: - dev_err(CARD_TO_DEV(ctrl->card), - "Unknown command in DMA!(cmd: x%02x " - "laddr x%08x st: x%02x\n", - dma->cmd, dma->laddr, hw_st); - status |= DMA_SW_ERR; - - break; - } - - if (requeue_cmd) - rsxx_requeue_dma(ctrl, dma); - else - rsxx_complete_dma(ctrl, dma, status); -} - -static void dma_engine_stalled(struct timer_list *t) -{ - struct rsxx_dma_ctrl *ctrl = from_timer(ctrl, t, activity_timer); - int cnt; - - if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || - unlikely(ctrl->card->eeh_state)) - return; - - if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) { - /* - * The dma engine was stalled because the SW_CMD_IDX write - * was lost. Issue it again to recover. - */ - dev_warn(CARD_TO_DEV(ctrl->card), - "SW_CMD_IDX write was lost, re-writing...\n"); - iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); - mod_timer(&ctrl->activity_timer, - jiffies + DMA_ACTIVITY_TIMEOUT); - } else { - dev_warn(CARD_TO_DEV(ctrl->card), - "DMA channel %d has stalled, faulting interface.\n", - ctrl->id); - ctrl->card->dma_fault = 1; - - /* Clean up the DMA queue */ - spin_lock(&ctrl->queue_lock); - cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); - spin_unlock(&ctrl->queue_lock); - - cnt += rsxx_dma_cancel(ctrl); - - if (cnt) - dev_info(CARD_TO_DEV(ctrl->card), - "Freed %d queued DMAs on channel %d\n", - cnt, ctrl->id); - } -} - -static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) -{ - struct rsxx_dma *dma; - int tag; - int cmds_pending = 0; - struct hw_cmd *hw_cmd_buf; - int dir; - - hw_cmd_buf = ctrl->cmd.buf; - - if (unlikely(ctrl->card->halt) || - unlikely(ctrl->card->eeh_state)) - return; - - while (1) { - spin_lock_bh(&ctrl->queue_lock); - if (list_empty(&ctrl->queue)) { - spin_unlock_bh(&ctrl->queue_lock); - break; - } - spin_unlock_bh(&ctrl->queue_lock); - - tag = pop_tracker(ctrl->trackers); - if (tag == -1) - break; - - spin_lock_bh(&ctrl->queue_lock); - dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); - list_del(&dma->list); - ctrl->stats.sw_q_depth--; - spin_unlock_bh(&ctrl->queue_lock); - - /* - * This will catch any DMAs that slipped in right before the - * fault, but was queued after all the other DMAs were - * cancelled. - */ - if (unlikely(ctrl->card->dma_fault)) { - push_tracker(ctrl->trackers, tag); - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - continue; - } - - if (dma->cmd != HW_CMD_BLK_DISCARD) { - if (dma->cmd == HW_CMD_BLK_WRITE) - dir = DMA_TO_DEVICE; - else - dir = DMA_FROM_DEVICE; - - /* - * The function dma_map_page is placed here because we - * can only, by design, issue up to 255 commands to the - * hardware at one time per DMA channel. So the maximum - * amount of mapped memory would be 255 * 4 channels * - * 4096 Bytes which is less than 2GB, the limit of a x8 - * Non-HWWD PCIe slot. This way the dma_map_page - * function should never fail because of a lack of - * mappable memory. 
- */ - dma->dma_addr = dma_map_page(&ctrl->card->dev->dev, dma->page, - dma->pg_off, dma->sub_page.cnt << 9, dir); - if (dma_mapping_error(&ctrl->card->dev->dev, dma->dma_addr)) { - push_tracker(ctrl->trackers, tag); - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - continue; - } - } - - set_tracker_dma(ctrl->trackers, tag, dma); - hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; - hw_cmd_buf[ctrl->cmd.idx].tag = tag; - hw_cmd_buf[ctrl->cmd.idx]._rsvd = 0; - hw_cmd_buf[ctrl->cmd.idx].sub_page = - ((dma->sub_page.cnt & 0x7) << 4) | - (dma->sub_page.off & 0x7); - - hw_cmd_buf[ctrl->cmd.idx].device_addr = - cpu_to_le32(dma->laddr); - - hw_cmd_buf[ctrl->cmd.idx].host_addr = - cpu_to_le64(dma->dma_addr); - - dev_dbg(CARD_TO_DEV(ctrl->card), - "Issue DMA%d(laddr %d tag %d) to idx %d\n", - ctrl->id, dma->laddr, tag, ctrl->cmd.idx); - - ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK; - cmds_pending++; - - if (dma->cmd == HW_CMD_BLK_WRITE) - ctrl->stats.writes_issued++; - else if (dma->cmd == HW_CMD_BLK_DISCARD) - ctrl->stats.discards_issued++; - else - ctrl->stats.reads_issued++; - } - - /* Let HW know we've queued commands. */ - if (cmds_pending) { - atomic_add(cmds_pending, &ctrl->stats.hw_q_depth); - mod_timer(&ctrl->activity_timer, - jiffies + DMA_ACTIVITY_TIMEOUT); - - if (unlikely(ctrl->card->eeh_state)) { - del_timer_sync(&ctrl->activity_timer); - return; - } - - iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); - } -} - -static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl) -{ - struct rsxx_dma *dma; - unsigned long flags; - u16 count; - u8 status; - u8 tag; - struct hw_status *hw_st_buf; - - hw_st_buf = ctrl->status.buf; - - if (unlikely(ctrl->card->halt) || - unlikely(ctrl->card->dma_fault) || - unlikely(ctrl->card->eeh_state)) - return; - - count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); - - while (count == ctrl->e_cnt) { - /* - * The read memory-barrier is necessary to keep aggressive - * processors/optimizers (such as the PPC Apple G5) from - * reordering the following status-buffer tag & status read - * *before* the count read on subsequent iterations of the - * loop! 
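The same ordering rule in miniature (sketch only: slot, expected and use() stand in for the status buffer, e_cnt and the completion handling below; index masking and wrap are omitted):

	while (le16_to_cpu(slot->count) == expected) {
		rmb();	/* order the count load before the tag/status loads */
		use(slot->status, slot->tag);
		slot++;		/* next ring entry (wrap omitted) */
		expected++;
	}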
- */ - rmb(); - - status = hw_st_buf[ctrl->status.idx].status; - tag = hw_st_buf[ctrl->status.idx].tag; - - dma = get_tracker_dma(ctrl->trackers, tag); - if (dma == NULL) { - spin_lock_irqsave(&ctrl->card->irq_lock, flags); - rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL); - spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); - - dev_err(CARD_TO_DEV(ctrl->card), - "No tracker for tag %d " - "(idx %d id %d)\n", - tag, ctrl->status.idx, ctrl->id); - return; - } - - dev_dbg(CARD_TO_DEV(ctrl->card), - "Completing DMA%d" - "(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n", - ctrl->id, dma->laddr, tag, status, count, - ctrl->status.idx); - - atomic_dec(&ctrl->stats.hw_q_depth); - - mod_timer(&ctrl->activity_timer, - jiffies + DMA_ACTIVITY_TIMEOUT); - - if (status) - rsxx_handle_dma_error(ctrl, dma, status); - else - rsxx_complete_dma(ctrl, dma, 0); - - push_tracker(ctrl->trackers, tag); - - ctrl->status.idx = (ctrl->status.idx + 1) & - RSXX_CS_IDX_MASK; - ctrl->e_cnt++; - - count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); - } - - dma_intr_coal_auto_tune(ctrl->card); - - if (atomic_read(&ctrl->stats.hw_q_depth) == 0) - del_timer_sync(&ctrl->activity_timer); - - spin_lock_irqsave(&ctrl->card->irq_lock, flags); - rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); - spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); - - spin_lock_bh(&ctrl->queue_lock); - if (ctrl->stats.sw_q_depth) - queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); - spin_unlock_bh(&ctrl->queue_lock); -} - -static void rsxx_schedule_issue(struct work_struct *work) -{ - struct rsxx_dma_ctrl *ctrl; - - ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); - - mutex_lock(&ctrl->work_lock); - rsxx_issue_dmas(ctrl); - mutex_unlock(&ctrl->work_lock); -} - -static void rsxx_schedule_done(struct work_struct *work) -{ - struct rsxx_dma_ctrl *ctrl; - - ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); - - mutex_lock(&ctrl->work_lock); - rsxx_dma_done(ctrl); - mutex_unlock(&ctrl->work_lock); -} - -static blk_status_t rsxx_queue_discard(struct rsxx_cardinfo *card, - struct list_head *q, - unsigned int laddr, - rsxx_dma_cb cb, - void *cb_data) -{ - struct rsxx_dma *dma; - - dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); - if (!dma) - return BLK_STS_RESOURCE; - - dma->cmd = HW_CMD_BLK_DISCARD; - dma->laddr = laddr; - dma->dma_addr = 0; - dma->sub_page.off = 0; - dma->sub_page.cnt = 0; - dma->page = NULL; - dma->pg_off = 0; - dma->cb = cb; - dma->cb_data = cb_data; - - dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", dma->laddr); - - list_add_tail(&dma->list, q); - - return 0; -} - -static blk_status_t rsxx_queue_dma(struct rsxx_cardinfo *card, - struct list_head *q, - int dir, - unsigned int dma_off, - unsigned int dma_len, - unsigned int laddr, - struct page *page, - unsigned int pg_off, - rsxx_dma_cb cb, - void *cb_data) -{ - struct rsxx_dma *dma; - - dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); - if (!dma) - return BLK_STS_RESOURCE; - - dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; - dma->laddr = laddr; - dma->sub_page.off = (dma_off >> 9); - dma->sub_page.cnt = (dma_len >> 9); - dma->page = page; - dma->pg_off = pg_off; - dma->cb = cb; - dma->cb_data = cb_data; - - dev_dbg(CARD_TO_DEV(card), - "Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n", - dir ? 
'W' : 'R', dma->laddr, dma->sub_page.off, - dma->sub_page.cnt, dma->page, dma->pg_off); - - /* Queue the DMA */ - list_add_tail(&dma->list, q); - - return 0; -} - -blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, - struct bio *bio, - atomic_t *n_dmas, - rsxx_dma_cb cb, - void *cb_data) -{ - struct list_head dma_list[RSXX_MAX_TARGETS]; - struct bio_vec bvec; - struct bvec_iter iter; - unsigned long long addr8; - unsigned int laddr; - unsigned int bv_len; - unsigned int bv_off; - unsigned int dma_off; - unsigned int dma_len; - int dma_cnt[RSXX_MAX_TARGETS]; - int tgt; - blk_status_t st; - int i; - - addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */ - atomic_set(n_dmas, 0); - - for (i = 0; i < card->n_targets; i++) { - INIT_LIST_HEAD(&dma_list[i]); - dma_cnt[i] = 0; - } - - if (bio_op(bio) == REQ_OP_DISCARD) { - bv_len = bio->bi_iter.bi_size; - - while (bv_len > 0) { - tgt = rsxx_get_dma_tgt(card, addr8); - laddr = rsxx_addr8_to_laddr(addr8, card); - - st = rsxx_queue_discard(card, &dma_list[tgt], laddr, - cb, cb_data); - if (st) - goto bvec_err; - - dma_cnt[tgt]++; - atomic_inc(n_dmas); - addr8 += RSXX_HW_BLK_SIZE; - bv_len -= RSXX_HW_BLK_SIZE; - } - } else { - bio_for_each_segment(bvec, bio, iter) { - bv_len = bvec.bv_len; - bv_off = bvec.bv_offset; - - while (bv_len > 0) { - tgt = rsxx_get_dma_tgt(card, addr8); - laddr = rsxx_addr8_to_laddr(addr8, card); - dma_off = addr8 & RSXX_HW_BLK_MASK; - dma_len = min(bv_len, - RSXX_HW_BLK_SIZE - dma_off); - - st = rsxx_queue_dma(card, &dma_list[tgt], - bio_data_dir(bio), - dma_off, dma_len, - laddr, bvec.bv_page, - bv_off, cb, cb_data); - if (st) - goto bvec_err; - - dma_cnt[tgt]++; - atomic_inc(n_dmas); - addr8 += dma_len; - bv_off += dma_len; - bv_len -= dma_len; - } - } - } - - for (i = 0; i < card->n_targets; i++) { - if (!list_empty(&dma_list[i])) { - spin_lock_bh(&card->ctrl[i].queue_lock); - card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; - list_splice_tail(&dma_list[i], &card->ctrl[i].queue); - spin_unlock_bh(&card->ctrl[i].queue_lock); - - queue_work(card->ctrl[i].issue_wq, - &card->ctrl[i].issue_dma_work); - } - } - - return 0; - -bvec_err: - for (i = 0; i < card->n_targets; i++) - rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], - FREE_DMA); - return st; -} - - -/*----------------- DMA Engine Initialization & Setup -------------------*/ -int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl) -{ - ctrl->status.buf = dma_alloc_coherent(&dev->dev, STATUS_BUFFER_SIZE8, - &ctrl->status.dma_addr, GFP_KERNEL); - ctrl->cmd.buf = dma_alloc_coherent(&dev->dev, COMMAND_BUFFER_SIZE8, - &ctrl->cmd.dma_addr, GFP_KERNEL); - if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL) - return -ENOMEM; - - memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8); - iowrite32(lower_32_bits(ctrl->status.dma_addr), - ctrl->regmap + SB_ADD_LO); - iowrite32(upper_32_bits(ctrl->status.dma_addr), - ctrl->regmap + SB_ADD_HI); - - memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8); - iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO); - iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI); - - ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT); - if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) { - dev_crit(&dev->dev, "Failed reading status cnt x%x\n", - ctrl->status.idx); - return -EINVAL; - } - iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT); - iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT); - - ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX); - if 
(ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) { - dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n", - ctrl->status.idx); - return -EINVAL; - } - iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); - iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); - - return 0; -} - -static int rsxx_dma_ctrl_init(struct pci_dev *dev, - struct rsxx_dma_ctrl *ctrl) -{ - int i; - int st; - - memset(&ctrl->stats, 0, sizeof(ctrl->stats)); - - ctrl->trackers = vmalloc(struct_size(ctrl->trackers, list, - RSXX_MAX_OUTSTANDING_CMDS)); - if (!ctrl->trackers) - return -ENOMEM; - - ctrl->trackers->head = 0; - for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { - ctrl->trackers->list[i].next_tag = i + 1; - ctrl->trackers->list[i].dma = NULL; - } - ctrl->trackers->list[RSXX_MAX_OUTSTANDING_CMDS-1].next_tag = -1; - spin_lock_init(&ctrl->trackers->lock); - - spin_lock_init(&ctrl->queue_lock); - mutex_init(&ctrl->work_lock); - INIT_LIST_HEAD(&ctrl->queue); - - timer_setup(&ctrl->activity_timer, dma_engine_stalled, 0); - - ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0); - if (!ctrl->issue_wq) - return -ENOMEM; - - ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0); - if (!ctrl->done_wq) - return -ENOMEM; - - INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue); - INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done); - - st = rsxx_hw_buffers_init(dev, ctrl); - if (st) - return st; - - return 0; -} - -static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card, - unsigned int stripe_size8) -{ - if (!is_power_of_2(stripe_size8)) { - dev_err(CARD_TO_DEV(card), - "stripe_size is NOT a power of 2!\n"); - return -EINVAL; - } - - card->_stripe.lower_mask = stripe_size8 - 1; - - card->_stripe.upper_mask = ~(card->_stripe.lower_mask); - card->_stripe.upper_shift = ffs(card->n_targets) - 1; - - card->_stripe.target_mask = card->n_targets - 1; - card->_stripe.target_shift = ffs(stripe_size8) - 1; - - dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask = x%016llx\n", - card->_stripe.lower_mask); - dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift = x%016llx\n", - card->_stripe.upper_shift); - dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask = x%016llx\n", - card->_stripe.upper_mask); - dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask = x%016llx\n", - card->_stripe.target_mask); - dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n", - card->_stripe.target_shift); - - return 0; -} - -int rsxx_dma_configure(struct rsxx_cardinfo *card) -{ - u32 intr_coal; - - intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, - card->config.data.intr_coal.count, - card->config.data.intr_coal.latency); - iowrite32(intr_coal, card->regmap + INTR_COAL); - - return rsxx_dma_stripe_setup(card, card->config.data.stripe_size); -} - -int rsxx_dma_setup(struct rsxx_cardinfo *card) -{ - unsigned long flags; - int st; - int i; - - dev_info(CARD_TO_DEV(card), - "Initializing %d DMA targets\n", - card->n_targets); - - /* Regmap is divided up into 4K chunks. 
One for each DMA channel */ - for (i = 0; i < card->n_targets; i++) - card->ctrl[i].regmap = card->regmap + (i * 4096); - - card->dma_fault = 0; - - /* Reset the DMA queues */ - rsxx_dma_queue_reset(card); - - /************* Setup DMA Control *************/ - for (i = 0; i < card->n_targets; i++) { - st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]); - if (st) - goto failed_dma_setup; - - card->ctrl[i].card = card; - card->ctrl[i].id = i; - } - - card->scrub_hard = 1; - - if (card->config_valid) - rsxx_dma_configure(card); - - /* Enable the interrupts after all setup has completed. */ - for (i = 0; i < card->n_targets; i++) { - spin_lock_irqsave(&card->irq_lock, flags); - rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i)); - spin_unlock_irqrestore(&card->irq_lock, flags); - } - - return 0; - -failed_dma_setup: - for (i = 0; i < card->n_targets; i++) { - struct rsxx_dma_ctrl *ctrl = &card->ctrl[i]; - - if (ctrl->issue_wq) { - destroy_workqueue(ctrl->issue_wq); - ctrl->issue_wq = NULL; - } - - if (ctrl->done_wq) { - destroy_workqueue(ctrl->done_wq); - ctrl->done_wq = NULL; - } - - vfree(ctrl->trackers); - - if (ctrl->status.buf) - dma_free_coherent(&card->dev->dev, STATUS_BUFFER_SIZE8, - ctrl->status.buf, - ctrl->status.dma_addr); - if (ctrl->cmd.buf) - dma_free_coherent(&card->dev->dev, COMMAND_BUFFER_SIZE8, - ctrl->cmd.buf, ctrl->cmd.dma_addr); - } - - return st; -} - -int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl) -{ - struct rsxx_dma *dma; - int i; - int cnt = 0; - - /* Clean up issued DMAs */ - for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { - dma = get_tracker_dma(ctrl->trackers, i); - if (dma) { - atomic_dec(&ctrl->stats.hw_q_depth); - rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); - push_tracker(ctrl->trackers, i); - cnt++; - } - } - - return cnt; -} - -void rsxx_dma_destroy(struct rsxx_cardinfo *card) -{ - struct rsxx_dma_ctrl *ctrl; - int i; - - for (i = 0; i < card->n_targets; i++) { - ctrl = &card->ctrl[i]; - - if (ctrl->issue_wq) { - destroy_workqueue(ctrl->issue_wq); - ctrl->issue_wq = NULL; - } - - if (ctrl->done_wq) { - destroy_workqueue(ctrl->done_wq); - ctrl->done_wq = NULL; - } - - if (timer_pending(&ctrl->activity_timer)) - del_timer_sync(&ctrl->activity_timer); - - /* Clean up the DMA queue */ - spin_lock_bh(&ctrl->queue_lock); - rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); - spin_unlock_bh(&ctrl->queue_lock); - - rsxx_dma_cancel(ctrl); - - vfree(ctrl->trackers); - - dma_free_coherent(&card->dev->dev, STATUS_BUFFER_SIZE8, - ctrl->status.buf, ctrl->status.dma_addr); - dma_free_coherent(&card->dev->dev, COMMAND_BUFFER_SIZE8, - ctrl->cmd.buf, ctrl->cmd.dma_addr); - } -} - -int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) -{ - int i; - int j; - int cnt; - struct rsxx_dma *dma; - struct list_head *issued_dmas; - - issued_dmas = kcalloc(card->n_targets, sizeof(*issued_dmas), - GFP_KERNEL); - if (!issued_dmas) - return -ENOMEM; - - for (i = 0; i < card->n_targets; i++) { - INIT_LIST_HEAD(&issued_dmas[i]); - cnt = 0; - for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { - dma = get_tracker_dma(card->ctrl[i].trackers, j); - if (dma == NULL) - continue; - - if (dma->cmd == HW_CMD_BLK_WRITE) - card->ctrl[i].stats.writes_issued--; - else if (dma->cmd == HW_CMD_BLK_DISCARD) - card->ctrl[i].stats.discards_issued--; - else - card->ctrl[i].stats.reads_issued--; - - if (dma->cmd != HW_CMD_BLK_DISCARD) { - dma_unmap_page(&card->dev->dev, dma->dma_addr, - get_dma_size(dma), - dma->cmd == HW_CMD_BLK_WRITE ? 
- DMA_TO_DEVICE : - DMA_FROM_DEVICE); - } - - list_add_tail(&dma->list, &issued_dmas[i]); - push_tracker(card->ctrl[i].trackers, j); - cnt++; - } - - spin_lock_bh(&card->ctrl[i].queue_lock); - list_splice(&issued_dmas[i], &card->ctrl[i].queue); - - atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); - card->ctrl[i].stats.sw_q_depth += cnt; - card->ctrl[i].e_cnt = 0; - spin_unlock_bh(&card->ctrl[i].queue_lock); - } - - kfree(issued_dmas); - - return 0; -} - -int rsxx_dma_init(void) -{ - rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); - if (!rsxx_dma_pool) - return -ENOMEM; - - return 0; -} - - -void rsxx_dma_cleanup(void) -{ - kmem_cache_destroy(rsxx_dma_pool); -} - diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h deleted file mode 100644 index 4f84905a6fd2..000000000000 --- a/drivers/block/rsxx/rsxx.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -* Filename: rsxx.h -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#ifndef __RSXX_H__ -#define __RSXX_H__ - -/*----------------- IOCTL Definitions -------------------*/ - -#define RSXX_MAX_DATA 8 - -struct rsxx_reg_access { - __u32 addr; - __u32 cnt; - __u32 stat; - __u32 stream; - __u32 data[RSXX_MAX_DATA]; -}; - -#define RSXX_MAX_REG_CNT (RSXX_MAX_DATA * (sizeof(__u32))) - -#define RSXX_IOC_MAGIC 'r' - -#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access) -#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access) - -#endif /* __RSXX_H_ */ diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h deleted file mode 100644 index 2b79015f5849..000000000000 --- a/drivers/block/rsxx/rsxx_cfg.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -* Filename: rsXX_cfg.h -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#ifndef __RSXX_CFG_H__ -#define __RSXX_CFG_H__ - -/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */ -#include - -/* - * The card config version must match the driver's expected version. If it does - * not, the DMA interfaces will not be attached and the user will need to - * initialize/upgrade the card configuration using the card config utility. - */ -#define RSXX_CFG_VERSION 4 - -struct card_cfg_hdr { - __u32 version; - __u32 crc; -}; - -struct card_cfg_data { - __u32 block_size; - __u32 stripe_size; - __u32 vendor_id; - __u32 cache_order; - struct { - __u32 mode; /* Disabled, manual, auto-tune... 
*/ - __u32 count; /* Number of intr to coalesce */ - __u32 latency;/* Max wait time (in ns) */ - } intr_coal; -}; - -struct rsxx_card_cfg { - struct card_cfg_hdr hdr; - struct card_cfg_data data; -}; - -/* Vendor ID Values */ -#define RSXX_VENDOR_ID_IBM 0 -#define RSXX_VENDOR_ID_DSI 1 -#define RSXX_VENDOR_COUNT 2 - -/* Interrupt Coalescing Values */ -#define RSXX_INTR_COAL_DISABLED 0 -#define RSXX_INTR_COAL_EXPLICIT 1 -#define RSXX_INTR_COAL_AUTO_TUNE 2 - - -#endif /* __RSXX_CFG_H__ */ - diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h deleted file mode 100644 index 26c320c0d924..000000000000 --- a/drivers/block/rsxx/rsxx_priv.h +++ /dev/null @@ -1,418 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -* Filename: rsxx_priv.h -* -* Authors: Joshua Morris -* Philip Kelleher -* -* (C) Copyright 2013 IBM Corporation -*/ - -#ifndef __RSXX_PRIV_H__ -#define __RSXX_PRIV_H__ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rsxx.h" -#include "rsxx_cfg.h" - -struct proc_cmd; - -#define PCI_DEVICE_ID_FS70_FLASH 0x04A9 -#define PCI_DEVICE_ID_FS80_FLASH 0x04AA - -#define RS70_PCI_REV_SUPPORTED 4 - -#define DRIVER_NAME "rsxx" -#define DRIVER_VERSION "4.0.3.2516" - -/* Block size is 4096 */ -#define RSXX_HW_BLK_SHIFT 12 -#define RSXX_HW_BLK_SIZE (1 << RSXX_HW_BLK_SHIFT) -#define RSXX_HW_BLK_MASK (RSXX_HW_BLK_SIZE - 1) - -#define MAX_CREG_DATA8 32 -#define LOG_BUF_SIZE8 128 - -#define RSXX_MAX_OUTSTANDING_CMDS 255 -#define RSXX_CS_IDX_MASK 0xff - -#define STATUS_BUFFER_SIZE8 4096 -#define COMMAND_BUFFER_SIZE8 4096 - -#define RSXX_MAX_TARGETS 8 - -struct dma_tracker_list; - -/* DMA Command/Status Buffer structure */ -struct rsxx_cs_buffer { - dma_addr_t dma_addr; - void *buf; - u32 idx; -}; - -struct rsxx_dma_stats { - u32 crc_errors; - u32 hard_errors; - u32 soft_errors; - u32 writes_issued; - u32 writes_failed; - u32 reads_issued; - u32 reads_failed; - u32 reads_retried; - u32 discards_issued; - u32 discards_failed; - u32 done_rescheduled; - u32 issue_rescheduled; - u32 dma_sw_err; - u32 dma_hw_fault; - u32 dma_cancelled; - u32 sw_q_depth; /* Number of DMAs on the SW queue. */ - atomic_t hw_q_depth; /* Number of DMAs queued to HW. 
*/ -}; - -struct rsxx_dma_ctrl { - struct rsxx_cardinfo *card; - int id; - void __iomem *regmap; - struct rsxx_cs_buffer status; - struct rsxx_cs_buffer cmd; - u16 e_cnt; - spinlock_t queue_lock; - struct list_head queue; - struct workqueue_struct *issue_wq; - struct work_struct issue_dma_work; - struct workqueue_struct *done_wq; - struct work_struct dma_done_work; - struct timer_list activity_timer; - struct dma_tracker_list *trackers; - struct rsxx_dma_stats stats; - struct mutex work_lock; -}; - -struct rsxx_cardinfo { - struct pci_dev *dev; - unsigned int halt; - unsigned int eeh_state; - - void __iomem *regmap; - spinlock_t irq_lock; - unsigned int isr_mask; - unsigned int ier_mask; - - struct rsxx_card_cfg config; - int config_valid; - - /* Embedded CPU Communication */ - struct { - spinlock_t lock; - bool active; - struct creg_cmd *active_cmd; - struct workqueue_struct *creg_wq; - struct work_struct done_work; - struct list_head queue; - unsigned int q_depth; - /* Cache the creg status to prevent ioreads */ - struct { - u32 stat; - u32 failed_cancel_timer; - u32 creg_timeout; - } creg_stats; - struct timer_list cmd_timer; - struct mutex reset_lock; - int reset; - } creg_ctrl; - - struct { - char tmp[MAX_CREG_DATA8]; - char buf[LOG_BUF_SIZE8]; /* terminated */ - int buf_len; - } log; - - struct workqueue_struct *event_wq; - struct work_struct event_work; - unsigned int state; - u64 size8; - - /* Lock the device attach/detach function */ - struct mutex dev_lock; - - /* Block Device Variables */ - bool bdev_attached; - int disk_id; - int major; - struct gendisk *gendisk; - struct { - /* Used to convert a byte address to a device address. */ - u64 lower_mask; - u64 upper_shift; - u64 upper_mask; - u64 target_mask; - u64 target_shift; - } _stripe; - unsigned int dma_fault; - - int scrub_hard; - - int n_targets; - struct rsxx_dma_ctrl *ctrl; - - struct dentry *debugfs_dir; -}; - -enum rsxx_pci_regmap { - HWID = 0x00, /* Hardware Identification Register */ - SCRATCH = 0x04, /* Scratch/Debug Register */ - RESET = 0x08, /* Reset Register */ - ISR = 0x10, /* Interrupt Status Register */ - IER = 0x14, /* Interrupt Enable Register */ - IPR = 0x18, /* Interrupt Poll Register */ - CB_ADD_LO = 0x20, /* Command Host Buffer Address [31:0] */ - CB_ADD_HI = 0x24, /* Command Host Buffer Address [63:32]*/ - HW_CMD_IDX = 0x28, /* Hardware Processed Command Index */ - SW_CMD_IDX = 0x2C, /* Software Processed Command Index */ - SB_ADD_LO = 0x30, /* Status Host Buffer Address [31:0] */ - SB_ADD_HI = 0x34, /* Status Host Buffer Address [63:32] */ - HW_STATUS_CNT = 0x38, /* Hardware Status Counter */ - SW_STATUS_CNT = 0x3C, /* Deprecated */ - CREG_CMD = 0x40, /* CPU Command Register */ - CREG_ADD = 0x44, /* CPU Address Register */ - CREG_CNT = 0x48, /* CPU Count Register */ - CREG_STAT = 0x4C, /* CPU Status Register */ - CREG_DATA0 = 0x50, /* CPU Data Registers */ - CREG_DATA1 = 0x54, - CREG_DATA2 = 0x58, - CREG_DATA3 = 0x5C, - CREG_DATA4 = 0x60, - CREG_DATA5 = 0x64, - CREG_DATA6 = 0x68, - CREG_DATA7 = 0x6c, - INTR_COAL = 0x70, /* Interrupt Coalescing Register */ - HW_ERROR = 0x74, /* Card Error Register */ - PCI_DEBUG0 = 0x78, /* PCI Debug Registers */ - PCI_DEBUG1 = 0x7C, - PCI_DEBUG2 = 0x80, - PCI_DEBUG3 = 0x84, - PCI_DEBUG4 = 0x88, - PCI_DEBUG5 = 0x8C, - PCI_DEBUG6 = 0x90, - PCI_DEBUG7 = 0x94, - PCI_POWER_THROTTLE = 0x98, - PERF_CTRL = 0x9c, - PERF_TIMER_LO = 0xa0, - PERF_TIMER_HI = 0xa4, - PERF_RD512_LO = 0xa8, - PERF_RD512_HI = 0xac, - PERF_WR512_LO = 0xb0, - PERF_WR512_HI = 0xb4, - PCI_RECONFIG = 0xb8, 
-}; - -enum rsxx_intr { - CR_INTR_DMA0 = 0x00000001, - CR_INTR_CREG = 0x00000002, - CR_INTR_DMA1 = 0x00000004, - CR_INTR_EVENT = 0x00000008, - CR_INTR_DMA2 = 0x00000010, - CR_INTR_DMA3 = 0x00000020, - CR_INTR_DMA4 = 0x00000040, - CR_INTR_DMA5 = 0x00000080, - CR_INTR_DMA6 = 0x00000100, - CR_INTR_DMA7 = 0x00000200, - CR_INTR_ALL_C = 0x0000003f, - CR_INTR_ALL_G = 0x000003ff, - CR_INTR_DMA_ALL = 0x000003f5, - CR_INTR_ALL = 0xffffffff, -}; - -static inline int CR_INTR_DMA(int N) -{ - static const unsigned int _CR_INTR_DMA[] = { - CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3, - CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7 - }; - return _CR_INTR_DMA[N]; -} -enum rsxx_pci_reset { - DMA_QUEUE_RESET = 0x00000001, -}; - -enum rsxx_hw_fifo_flush { - RSXX_FLUSH_BUSY = 0x00000002, - RSXX_FLUSH_TIMEOUT = 0x00000004, -}; - -enum rsxx_pci_revision { - RSXX_DISCARD_SUPPORT = 2, - RSXX_EEH_SUPPORT = 3, -}; - -enum rsxx_creg_cmd { - CREG_CMD_TAG_MASK = 0x0000FF00, - CREG_OP_WRITE = 0x000000C0, - CREG_OP_READ = 0x000000E0, -}; - -enum rsxx_creg_addr { - CREG_ADD_CARD_CMD = 0x80001000, - CREG_ADD_CARD_STATE = 0x80001004, - CREG_ADD_CARD_SIZE = 0x8000100c, - CREG_ADD_CAPABILITIES = 0x80001050, - CREG_ADD_LOG = 0x80002000, - CREG_ADD_NUM_TARGETS = 0x80003000, - CREG_ADD_CRAM = 0xA0000000, - CREG_ADD_CONFIG = 0xB0000000, -}; - -enum rsxx_creg_card_cmd { - CARD_CMD_STARTUP = 1, - CARD_CMD_SHUTDOWN = 2, - CARD_CMD_LOW_LEVEL_FORMAT = 3, - CARD_CMD_FPGA_RECONFIG_BR = 4, - CARD_CMD_FPGA_RECONFIG_MAIN = 5, - CARD_CMD_BACKUP = 6, - CARD_CMD_RESET = 7, - CARD_CMD_deprecated = 8, - CARD_CMD_UNINITIALIZE = 9, - CARD_CMD_DSTROY_EMERGENCY = 10, - CARD_CMD_DSTROY_NORMAL = 11, - CARD_CMD_DSTROY_EXTENDED = 12, - CARD_CMD_DSTROY_ABORT = 13, -}; - -enum rsxx_card_state { - CARD_STATE_SHUTDOWN = 0x00000001, - CARD_STATE_STARTING = 0x00000002, - CARD_STATE_FORMATTING = 0x00000004, - CARD_STATE_UNINITIALIZED = 0x00000008, - CARD_STATE_GOOD = 0x00000010, - CARD_STATE_SHUTTING_DOWN = 0x00000020, - CARD_STATE_FAULT = 0x00000040, - CARD_STATE_RD_ONLY_FAULT = 0x00000080, - CARD_STATE_DSTROYING = 0x00000100, -}; - -enum rsxx_led { - LED_DEFAULT = 0x0, - LED_IDENTIFY = 0x1, - LED_SOAK = 0x2, -}; - -enum rsxx_creg_flash_lock { - CREG_FLASH_LOCK = 1, - CREG_FLASH_UNLOCK = 2, -}; - -enum rsxx_card_capabilities { - CARD_CAP_SUBPAGE_WRITES = 0x00000080, -}; - -enum rsxx_creg_stat { - CREG_STAT_STATUS_MASK = 0x00000003, - CREG_STAT_SUCCESS = 0x1, - CREG_STAT_ERROR = 0x2, - CREG_STAT_CHAR_PENDING = 0x00000004, /* Character I/O pending bit */ - CREG_STAT_LOG_PENDING = 0x00000008, /* HW log message pending bit */ - CREG_STAT_TAG_MASK = 0x0000ff00, -}; - -enum rsxx_dma_finish { - FREE_DMA = 0x0, - COMPLETE_DMA = 0x1, -}; - -static inline unsigned int CREG_DATA(int N) -{ - return CREG_DATA0 + (N << 2); -} - -/*----------------- Convenient Log Wrappers -------------------*/ -#define CARD_TO_DEV(__CARD) (&(__CARD)->dev->dev) - -/***** config.c *****/ -int rsxx_load_config(struct rsxx_cardinfo *card); - -/***** core.c *****/ -void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr); -void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr); -void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr); -void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, - unsigned int intr); - -/***** dev.c *****/ -int rsxx_attach_dev(struct rsxx_cardinfo *card); -void rsxx_detach_dev(struct rsxx_cardinfo *card); -int rsxx_setup_dev(struct rsxx_cardinfo *card); -void rsxx_destroy_dev(struct rsxx_cardinfo 
*card); -int rsxx_dev_init(void); -void rsxx_dev_cleanup(void); - -/***** dma.c ****/ -typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, - void *cb_data, - unsigned int status); -int rsxx_dma_setup(struct rsxx_cardinfo *card); -void rsxx_dma_destroy(struct rsxx_cardinfo *card); -int rsxx_dma_init(void); -int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, - struct list_head *q, - unsigned int done); -int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); -void rsxx_dma_cleanup(void); -void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); -int rsxx_dma_configure(struct rsxx_cardinfo *card); -blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, - struct bio *bio, - atomic_t *n_dmas, - rsxx_dma_cb cb, - void *cb_data); -int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); -int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); -int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); - -/***** cregs.c *****/ -int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr, - unsigned int size8, - void *data, - int byte_stream); -int rsxx_creg_read(struct rsxx_cardinfo *card, - u32 addr, - unsigned int size8, - void *data, - int byte_stream); -int rsxx_read_hw_log(struct rsxx_cardinfo *card); -int rsxx_get_card_state(struct rsxx_cardinfo *card, - unsigned int *state); -int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8); -int rsxx_get_num_targets(struct rsxx_cardinfo *card, - unsigned int *n_targets); -int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, - u32 *capabilities); -int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd); -int rsxx_creg_setup(struct rsxx_cardinfo *card); -void rsxx_creg_destroy(struct rsxx_cardinfo *card); -int rsxx_creg_init(void); -void rsxx_creg_cleanup(void); -int rsxx_reg_access(struct rsxx_cardinfo *card, - struct rsxx_reg_access __user *ucmd, - int read); -void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card); -void rsxx_kick_creg_queue(struct rsxx_cardinfo *card); - - - -#endif /* __DRIVERS_BLOCK_RSXX_H__ */ From f18ee3d988157ebcadc9b7e5fd34811938f50223 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 7 Dec 2021 14:55:49 +0100 Subject: [PATCH 15/32] nvme-fabrics: print out valid arguments when reading from /dev/nvme-fabrics Currently applications have a hard time figuring out which nvme-over-fabrics arguments are supported for any given kernel; the ioctl will return an error code on failure, and the application has to guess whether this was due to an invalid argument or due to a connection or controller error. With this patch applications can read a list of supported arguments by simply reading from /dev/nvme-fabrics, allowing them to validate the connection string. Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 282d54117e0a..7ae041e2b3fb 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1069,6 +1069,26 @@ out_unlock: return ret ? 
ret : count; } +static void __nvmf_concat_opt_tokens(struct seq_file *seq_file) +{ + const struct match_token *tok; + int idx; + + /* + * Add dummy entries for instance and cntlid to + * signal an invalid/non-existing controller + */ + seq_puts(seq_file, "instance=-1,cntlid=-1"); + for (idx = 0; idx < ARRAY_SIZE(opt_tokens); idx++) { + tok = &opt_tokens[idx]; + if (tok->token == NVMF_OPT_ERR) + continue; + seq_puts(seq_file, ","); + seq_puts(seq_file, tok->pattern); + } + seq_puts(seq_file, "\n"); +} + static int nvmf_dev_show(struct seq_file *seq_file, void *private) { struct nvme_ctrl *ctrl; @@ -1077,7 +1097,7 @@ static int nvmf_dev_show(struct seq_file *seq_file, void *private) mutex_lock(&nvmf_dev_mutex); ctrl = seq_file->private; if (!ctrl) { - ret = -EINVAL; + __nvmf_concat_opt_tokens(seq_file); goto out_unlock; } From e4fdb2b167ed225a3793a249c4342da915940b6b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 13 Dec 2021 09:08:47 -0800 Subject: [PATCH 16/32] nvme: increment request genctr on completion The nvme request generation counter is intended to catch duplicate completions. Incrementing the counter on submission means duplicates can only be caught if the request tag is reallocated and dispatched prior to the driver observing the corrupted CQE. Incrementing on completion removes this window, making it possible to detect duplicate completions in consecutive entries. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 -- drivers/nvme/host/nvme.h | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f82c098b1a61..44c375a1edbb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1037,8 +1037,6 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) return BLK_STS_IOERR; } - if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN)) - nvme_req(req)->genctr++; cmd->common.command_id = nvme_cid(req); trace_nvme_setup_cmd(req, cmd); return ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b334af8aa264..a54096ba0552 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -614,6 +614,10 @@ static inline bool nvme_try_complete_req(struct request *req, __le16 status, union nvme_result result) { struct nvme_request *rq = nvme_req(req); + struct nvme_ctrl *ctrl = rq->ctrl; + + if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN)) + rq->genctr++; rq->status = le16_to_cpu(status) >> 1; rq->result = result; From 3a605e32a7f8f78d844b4272c257029c337a4352 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 22 Dec 2021 17:32:44 +0800 Subject: [PATCH 17/32] nvme: drop unused variable ctrl in nvme_setup_cmd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable 'ctrl' became useless since the code using it was dropped from nvme_setup_cmd() in the commit 292ddf67bbd5 ("nvme: increment request genctr on completion"). 
Fix it to get rid of this compilation warning in the nvme-5.17 branch: drivers/nvme/host/core.c: In function ‘nvme_setup_cmd’: drivers/nvme/host/core.c:993:20: warning: unused variable ‘ctrl’ [-Wunused-variable] struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; ^~~~ Fixes: 292ddf67bbd5 ("nvme: increment request genctr on completion") Signed-off-by: Geliang Tang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 44c375a1edbb..9666c7bf4379 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -990,7 +990,6 @@ EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) { struct nvme_command *cmd = nvme_req(req)->cmd; - struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; blk_status_t ret = BLK_STS_OK; if (!(req->rq_flags & RQF_DONTPREP)) From e3d347943919f35ccdeed8d2cc62e8c6c12b36cd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 20 Dec 2021 13:51:45 +0100 Subject: [PATCH 18/32] nvme: add 'iopolicy' module parameter While the 'iopolicy' sysfs attribute can be set at runtime, most storage arrays prefer to use the 'round-robin' iopolicy per default. We can use udev rules to set this, but is getting rather unwieldy for rebranded arrays as we would have to update the udev rules anytime a new array shows up, leading to the same mess we currently have in multipathd for configuring the RDAC arrays. Hence this patch adds a module parameter 'iopolicy' to allow the admin to switch the default, and to do away with the need for a udev rule here. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 +--- drivers/nvme/host/multipath.c | 41 ++++++++++++++++++++++++++++++----- drivers/nvme/host/nvme.h | 4 ++++ 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9666c7bf4379..4fc794d9c2f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2746,9 +2746,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) return -EINVAL; } subsys->awupf = le16_to_cpu(id->awupf); -#ifdef CONFIG_NVME_MULTIPATH - subsys->iopolicy = NVME_IOPOLICY_NUMA; -#endif + nvme_mpath_default_iopolicy(subsys); subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 7f2071f2460c..892bd5dcb46b 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -13,6 +13,42 @@ module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +static const char *nvme_iopolicy_names[] = { + [NVME_IOPOLICY_NUMA] = "numa", + [NVME_IOPOLICY_RR] = "round-robin", +}; + +static int iopolicy = NVME_IOPOLICY_NUMA; + +static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp) +{ + if (!val) + return -EINVAL; + if (!strncmp(val, "numa", 4)) + iopolicy = NVME_IOPOLICY_NUMA; + else if (!strncmp(val, "round-robin", 11)) + iopolicy = NVME_IOPOLICY_RR; + else + return -EINVAL; + + return 0; +} + +static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp) +{ + return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]); +} + +module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy, + &iopolicy, 0644); +MODULE_PARM_DESC(iopolicy, + "Default 
multipath I/O policy; 'numa' (default) or 'round-robin'"); + +void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) +{ + subsys->iopolicy = iopolicy; +} + void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) { struct nvme_ns_head *h; @@ -706,11 +742,6 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl) struct device_attribute subsys_attr_##_name = \ __ATTR(_name, _mode, _show, _store) -static const char *nvme_iopolicy_names[] = { - [NVME_IOPOLICY_NUMA] = "numa", - [NVME_IOPOLICY_RR] = "round-robin", -}; - static ssize_t nvme_subsys_iopolicy_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a54096ba0552..fe224016418e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -767,6 +767,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); +void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); @@ -864,6 +865,9 @@ static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) { } +static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_revalidate_zones(struct nvme_ns *ns); From 19768f80cf23834e65482f1667ff54192d469fee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 24 Dec 2021 09:08:31 +0800 Subject: [PATCH 19/32] block: null_blk: only set set->nr_maps as 3 if active poll_queues is > 0 It isn't correct to set set->nr_maps as 3 if g_poll_queues is > 0 since we can change it via configfs for null_blk device created there, so only set it as 3 if active poll_queues is > 0. Fixes divide zero exception reported by Shinichiro. Fixes: 2bfdbe8b7ebd ("null_blk: allow zero poll queues") Reported-by: Shinichiro Kawasaki Signed-off-by: Ming Lei Reviewed-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20211224010831.1521805-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 6be6ccd4a28f..13004beb48ca 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1892,7 +1892,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) if (g_shared_tag_bitmap) set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; set->driver_data = nullb; - if (g_poll_queues) + if (poll_queues) set->nr_maps = 3; else set->nr_maps = 1; From d5dbcca70182501bed99de85c224cef04c38ed92 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 3 Jan 2022 17:24:08 +0100 Subject: [PATCH 20/32] pktcdvd: convert to use attribute groups There is no need to create kobject children of the pktcdvd device just to display a subdirectory name. Instead, use a named attribute group which removes the extra kobjects and also fixes the userspace race where the device is created yet tools like libudev can not see the attributes as they think the subdirectories are some other sort of device. 
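For readers unfamiliar with the conversion, the named attribute group pattern adopted below can be reduced to a minimal sketch. All example_* identifiers and the literal "stat" group name here are illustrative placeholders, not pktcdvd code; the actual conversion is in the diff that follows.

	/* Minimal sketch of a named sysfs attribute group. */
	#include <linux/device.h>
	#include <linux/sysfs.h>

	static ssize_t example_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
	{
		/* A real driver would read state via dev_get_drvdata(dev). */
		return sysfs_emit(buf, "%d\n", 42);
	}
	static DEVICE_ATTR_RO(example);

	static struct attribute *example_attrs[] = {
		&dev_attr_example.attr,
		NULL,
	};

	/* .name makes the attributes appear in a "stat" subdirectory. */
	static const struct attribute_group example_group = {
		.name	= "stat",
		.attrs	= example_attrs,
	};

	static const struct attribute_group *example_groups[] = {
		&example_group,
		NULL,
	};

Passing example_groups to device_create_with_groups() registers the attributes before the KOBJ_ADD uevent is emitted, which is what closes the userspace race described above.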
Cc: linux-block@vger.kernel.org Cc: Jens Axboe Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220103162408.742003-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 281 ++++++++++++++++++++-------------------- include/linux/pktcdvd.h | 10 -- 2 files changed, 137 insertions(+), 154 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 713b7dcf39f9..2b6b70a39e76 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -113,57 +113,10 @@ static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); } -/* - * create and register a pktcdvd kernel object. - */ -static struct pktcdvd_kobj* pkt_kobj_create(struct pktcdvd_device *pd, - const char* name, - struct kobject* parent, - struct kobj_type* ktype) -{ - struct pktcdvd_kobj *p; - int error; - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return NULL; - p->pd = pd; - error = kobject_init_and_add(&p->kobj, ktype, parent, "%s", name); - if (error) { - kobject_put(&p->kobj); - return NULL; - } - kobject_uevent(&p->kobj, KOBJ_ADD); - return p; -} -/* - * remove a pktcdvd kernel object. - */ -static void pkt_kobj_remove(struct pktcdvd_kobj *p) -{ - if (p) - kobject_put(&p->kobj); -} -/* - * default release function for pktcdvd kernel objects. - */ -static void pkt_kobj_release(struct kobject *kobj) -{ - kfree(to_pktcdvdkobj(kobj)); -} - - /********************************************************** - * * sysfs interface for pktcdvd * by (C) 2006 Thomas Maier - * - **********************************************************/ - -#define DEF_ATTR(_obj,_name,_mode) \ - static struct attribute _obj = { .name = _name, .mode = _mode } - -/********************************************************** + /sys/class/pktcdvd/pktcdvd[0-7]/ stat/reset stat/packets_started @@ -176,75 +129,94 @@ static void pkt_kobj_release(struct kobject *kobj) write_queue/congestion_on **********************************************************/ -DEF_ATTR(kobj_pkt_attr_st1, "reset", 0200); -DEF_ATTR(kobj_pkt_attr_st2, "packets_started", 0444); -DEF_ATTR(kobj_pkt_attr_st3, "packets_finished", 0444); -DEF_ATTR(kobj_pkt_attr_st4, "kb_written", 0444); -DEF_ATTR(kobj_pkt_attr_st5, "kb_read", 0444); -DEF_ATTR(kobj_pkt_attr_st6, "kb_read_gather", 0444); - -static struct attribute *kobj_pkt_attrs_stat[] = { - &kobj_pkt_attr_st1, - &kobj_pkt_attr_st2, - &kobj_pkt_attr_st3, - &kobj_pkt_attr_st4, - &kobj_pkt_attr_st5, - &kobj_pkt_attr_st6, - NULL -}; - -DEF_ATTR(kobj_pkt_attr_wq1, "size", 0444); -DEF_ATTR(kobj_pkt_attr_wq2, "congestion_off", 0644); -DEF_ATTR(kobj_pkt_attr_wq3, "congestion_on", 0644); - -static struct attribute *kobj_pkt_attrs_wqueue[] = { - &kobj_pkt_attr_wq1, - &kobj_pkt_attr_wq2, - &kobj_pkt_attr_wq3, - NULL -}; - -static ssize_t kobj_pkt_show(struct kobject *kobj, - struct attribute *attr, char *data) +static ssize_t packets_started_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd; - int n = 0; - int v; - if (strcmp(attr->name, "packets_started") == 0) { - n = sprintf(data, "%lu\n", pd->stats.pkt_started); + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "packets_finished") == 0) { - n = sprintf(data, "%lu\n", pd->stats.pkt_ended); + return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started); +} +static DEVICE_ATTR_RO(packets_started); - } else if (strcmp(attr->name, "kb_written") == 0) { - n = 
sprintf(data, "%lu\n", pd->stats.secs_w >> 1); +static ssize_t packets_finished_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "kb_read") == 0) { - n = sprintf(data, "%lu\n", pd->stats.secs_r >> 1); + return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended); +} +static DEVICE_ATTR_RO(packets_finished); - } else if (strcmp(attr->name, "kb_read_gather") == 0) { - n = sprintf(data, "%lu\n", pd->stats.secs_rg >> 1); +static ssize_t kb_written_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "size") == 0) { - spin_lock(&pd->lock); - v = pd->bio_queue_size; - spin_unlock(&pd->lock); - n = sprintf(data, "%d\n", v); + return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1); +} +static DEVICE_ATTR_RO(kb_written); - } else if (strcmp(attr->name, "congestion_off") == 0) { - spin_lock(&pd->lock); - v = pd->write_congestion_off; - spin_unlock(&pd->lock); - n = sprintf(data, "%d\n", v); +static ssize_t kb_read_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); - } else if (strcmp(attr->name, "congestion_on") == 0) { - spin_lock(&pd->lock); - v = pd->write_congestion_on; - spin_unlock(&pd->lock); - n = sprintf(data, "%d\n", v); + return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1); +} +static DEVICE_ATTR_RO(kb_read); + +static ssize_t kb_read_gather_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1); +} +static DEVICE_ATTR_RO(kb_read_gather); + +static ssize_t reset_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + if (len > 0) { + pd->stats.pkt_started = 0; + pd->stats.pkt_ended = 0; + pd->stats.secs_w = 0; + pd->stats.secs_rg = 0; + pd->stats.secs_r = 0; } + return len; +} +static DEVICE_ATTR_WO(reset); + +static struct attribute *pkt_stat_attrs[] = { + &dev_attr_packets_finished.attr, + &dev_attr_packets_started.attr, + &dev_attr_kb_read.attr, + &dev_attr_kb_written.attr, + &dev_attr_kb_read_gather.attr, + &dev_attr_reset.attr, + NULL, +}; + +static const struct attribute_group pkt_stat_group = { + .name = "stat", + .attrs = pkt_stat_attrs, +}; + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; + + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->bio_queue_size); + spin_unlock(&pd->lock); return n; } +static DEVICE_ATTR_RO(size); static void init_write_congestion_marks(int* lo, int* hi) { @@ -263,30 +235,56 @@ static void init_write_congestion_marks(int* lo, int* hi) } } -static ssize_t kobj_pkt_store(struct kobject *kobj, - struct attribute *attr, - const char *data, size_t len) +static ssize_t congestion_off_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd; + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; + + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->write_congestion_off); + spin_unlock(&pd->lock); + return n; +} + +static ssize_t congestion_off_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = 
dev_get_drvdata(dev); int val; - if (strcmp(attr->name, "reset") == 0 && len > 0) { - pd->stats.pkt_started = 0; - pd->stats.pkt_ended = 0; - pd->stats.secs_w = 0; - pd->stats.secs_rg = 0; - pd->stats.secs_r = 0; - - } else if (strcmp(attr->name, "congestion_off") == 0 - && sscanf(data, "%d", &val) == 1) { + if (sscanf(buf, "%d", &val) == 1) { spin_lock(&pd->lock); pd->write_congestion_off = val; init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); spin_unlock(&pd->lock); + } + return len; +} +static DEVICE_ATTR_RW(congestion_off); - } else if (strcmp(attr->name, "congestion_on") == 0 - && sscanf(data, "%d", &val) == 1) { +static ssize_t congestion_on_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; + + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->write_congestion_on); + spin_unlock(&pd->lock); + return n; +} + +static ssize_t congestion_on_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int val; + + if (sscanf(buf, "%d", &val) == 1) { spin_lock(&pd->lock); pd->write_congestion_on = val; init_write_congestion_marks(&pd->write_congestion_off, @@ -295,44 +293,39 @@ static ssize_t kobj_pkt_store(struct kobject *kobj, } return len; } +static DEVICE_ATTR_RW(congestion_on); -static const struct sysfs_ops kobj_pkt_ops = { - .show = kobj_pkt_show, - .store = kobj_pkt_store +static struct attribute *pkt_wq_attrs[] = { + &dev_attr_congestion_on.attr, + &dev_attr_congestion_off.attr, + &dev_attr_size.attr, + NULL, }; -static struct kobj_type kobj_pkt_type_stat = { - .release = pkt_kobj_release, - .sysfs_ops = &kobj_pkt_ops, - .default_attrs = kobj_pkt_attrs_stat + +static const struct attribute_group pkt_wq_group = { + .name = "write_queue", + .attrs = pkt_wq_attrs, }; -static struct kobj_type kobj_pkt_type_wqueue = { - .release = pkt_kobj_release, - .sysfs_ops = &kobj_pkt_ops, - .default_attrs = kobj_pkt_attrs_wqueue + +static const struct attribute_group *pkt_groups[] = { + &pkt_stat_group, + &pkt_wq_group, + NULL, }; static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) { if (class_pktcdvd) { - pd->dev = device_create(class_pktcdvd, NULL, MKDEV(0, 0), NULL, - "%s", pd->name); + pd->dev = device_create_with_groups(class_pktcdvd, NULL, + MKDEV(0, 0), pd, pkt_groups, + "%s", pd->name); if (IS_ERR(pd->dev)) pd->dev = NULL; } - if (pd->dev) { - pd->kobj_stat = pkt_kobj_create(pd, "stat", - &pd->dev->kobj, - &kobj_pkt_type_stat); - pd->kobj_wqueue = pkt_kobj_create(pd, "write_queue", - &pd->dev->kobj, - &kobj_pkt_type_wqueue); - } } static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) { - pkt_kobj_remove(pd->kobj_stat); - pkt_kobj_remove(pd->kobj_wqueue); if (class_pktcdvd) device_unregister(pd->dev); } diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index c391e694aa26..f9c5ac80d59b 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -152,14 +152,6 @@ struct packet_stacked_data }; #define PSD_POOL_SIZE 64 -struct pktcdvd_kobj -{ - struct kobject kobj; - struct pktcdvd_device *pd; -}; -#define to_pktcdvdkobj(_k) \ - ((struct pktcdvd_kobj*)container_of(_k,struct pktcdvd_kobj,kobj)) - struct pktcdvd_device { struct block_device *bdev; /* dev attached */ @@ -197,8 +189,6 @@ struct pktcdvd_device int write_congestion_on; struct device *dev; /* sysfs pktcdvd[0-7] dev */ - struct pktcdvd_kobj *kobj_stat; /* sysfs pktcdvd[0-7]/stat/ */ - struct 
pktcdvd_kobj *kobj_wqueue; /* sysfs pktcdvd[0-7]/write_queue/ */ struct dentry *dfs_d_root; /* debugfs: devname directory */ struct dentry *dfs_f_info; /* debugfs: info file */ From 050f461e28c5d13f327353d660ffad2603ce7ac1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Jan 2022 17:29:47 +0100 Subject: [PATCH 21/32] block/rnbd-clt-sysfs: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the rnbd controller sysfs code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Cc: "Md. Haris Iqbal" Cc: Jack Wang Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Acked-by: Jack Wang Link: https://lore.kernel.org/r/20220104162947.1320936-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 44e45af00e83..2be5d87a3ca6 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -452,6 +452,7 @@ static struct attribute *rnbd_dev_attrs[] = { &rnbd_clt_nr_poll_queues.attr, NULL, }; +ATTRIBUTE_GROUPS(rnbd_dev); void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) { @@ -474,7 +475,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) static struct kobj_type rnbd_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = rnbd_dev_attrs, + .default_groups = rnbd_dev_groups, }; static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) From 770b1d216d7371c94c88713824da4be4bc39a4e0 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 15 Nov 2021 17:23:17 -0800 Subject: [PATCH 22/32] md/raid5: play nice with PREEMPT_RT raid_run_ops() relies on the implicitly disabled preemption for its percpu ops, although this is really about CPU locality. This breaks RT semantics as it can take regular (and thus sleeping) spinlocks, such as stripe_lock. Add a local_lock such that non-RT does not change and continues to be just map to preempt_disable/enable, but makes RT happy as the region will use a per-CPU spinlock and thus be preemptible and still guarantee CPU locality. 
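As a rough illustration of the local_lock pattern, here is a minimal sketch using a statically allocated per-CPU variable and invented names (the raid5 change below instead embeds the local_lock_t in its dynamically allocated percpu struct and initializes it with local_lock_init()):

	#include <linux/local_lock.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(local_lock_t, example_lock) =
		INIT_LOCAL_LOCK(example_lock);
	static DEFINE_PER_CPU(int, example_scratch);

	static void example_use(void)
	{
		/*
		 * On !PREEMPT_RT this compiles down to preempt_disable(),
		 * so nothing changes; on PREEMPT_RT it takes a per-CPU
		 * spinlock, keeping the section preemptible while still
		 * guaranteeing CPU locality.
		 */
		local_lock(&example_lock);
		this_cpu_inc(example_scratch);
		local_unlock(&example_lock);
	}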
Signed-off-by: Davidlohr Bueso Signed-off-by: Song Liu --- drivers/md/raid5.c | 11 ++++++----- drivers/md/raid5.h | 4 +++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9c1a5877cf9f..1240a5c16af8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2215,10 +2215,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct r5conf *conf = sh->raid_conf; int level = conf->level; struct raid5_percpu *percpu; - unsigned long cpu; - cpu = get_cpu(); - percpu = per_cpu_ptr(conf->percpu, cpu); + local_lock(&conf->percpu->lock); + percpu = this_cpu_ptr(conf->percpu); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @@ -2271,13 +2270,14 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) BUG(); } - if (overlap_clear && !sh->batch_head) + if (overlap_clear && !sh->batch_head) { for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + } + local_unlock(&conf->percpu->lock); } static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) @@ -7052,6 +7052,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu return -ENOMEM; } + local_lock_init(&percpu->lock); return 0; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 5c05acf20e1f..9e8486a9e445 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -4,6 +4,7 @@ #include #include +#include /* * @@ -640,7 +641,8 @@ struct r5conf { * lists and performing address * conversions */ - int scribble_obj_size; + int scribble_obj_size; + local_lock_t lock; } __percpu *percpu; int scribble_disks; int scribble_sectors; From a92ce0feffeed8b91f02dac85246d1205e4a64b6 Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Fri, 17 Dec 2021 10:29:55 +0100 Subject: [PATCH 23/32] md: drop queue limitation for RAID1 and RAID10 As suggested by Neil Brown[1], this limitation seems to be deprecated. With plugging in use, writes are processed behind the raid thread and conf->pending_count is not increased. This limitation occurs only if caller doesn't use plugs. It can be avoided and often it is (with plugging). There are no reports that queue is growing to enormous size so remove queue limitation for non-plugged IOs too. [1] https://lore.kernel.org/linux-raid/162496301481.7211.18031090130574610495@noble.neil.brown.name Signed-off-by: Mariusz Tkaczyk Signed-off-by: Song Liu --- drivers/md/raid1-10.c | 6 ------ drivers/md/raid1.c | 7 ------- drivers/md/raid10.c | 7 ------- 3 files changed, 20 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 54db34163968..83f9a4f3d82e 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -22,12 +22,6 @@ #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) -/* When there are this many requests queue to be written by - * the raid thread, we become 'congested' to provide back-pressure - * for writeback. 
- */ -static int max_queued_requests = 1024; - /* for managing resync I/O pages */ struct resync_pages { void *raid_bio; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7dc8026cf6ee..eeaedd6e0ce1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1358,12 +1358,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; - if (conf->pending_count >= max_queued_requests) { - md_wakeup_thread(mddev->thread); - raid1_log(mddev, "wait queued"); - wait_event(conf->wait_barrier, - conf->pending_count < max_queued_requests); - } /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio @@ -3410,4 +3404,3 @@ MODULE_ALIAS("md-personality-3"); /* RAID1 */ MODULE_ALIAS("md-raid1"); MODULE_ALIAS("md-level-1"); -module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index dde98f65bd04..c683ba138b58 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1387,12 +1387,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, conf->reshape_safe = mddev->reshape_position; } - if (conf->pending_count >= max_queued_requests) { - md_wakeup_thread(mddev->thread); - raid10_log(mddev, "wait queued"); - wait_event(conf->wait_barrier, - conf->pending_count < max_queued_requests); - } /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio @@ -5243,4 +5237,3 @@ MODULE_ALIAS("md-personality-9"); /* RAID10 */ MODULE_ALIAS("md-raid10"); MODULE_ALIAS("md-level-10"); -module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); From f51d46d0e7cb5b8494aa534d276a9d8915a2443d Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Tue, 21 Dec 2021 20:06:19 +0000 Subject: [PATCH 24/32] md: add support for REQ_NOWAIT commit 021a24460dc2 ("block: add QUEUE_FLAG_NOWAIT") added support for checking whether a given bdev supports handling of REQ_NOWAIT or not. Since then commit 6abc49468eea ("dm: add support for REQ_NOWAIT and enable it for linear target") added support for REQ_NOWAIT for dm. This uses a similar approach to incorporate REQ_NOWAIT for md based bios. This patch was tested using t/io_uring tool within FIO. A nvme drive was partitioned into 2 partitions and a simple raid 0 configuration /dev/md0 was created. md0 : active raid0 nvme4n1p1[1] nvme4n1p2[0] 937423872 blocks super 1.2 512k chunks Before patch: $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100 Running top while the above runs: $ ps -eL | grep $(pidof io_uring) 38396 38396 pts/2 00:00:00 io_uring 38396 38397 pts/2 00:00:15 io_uring 38396 38398 pts/2 00:00:13 iou-wrk-38397 We can see iou-wrk-38397 io worker thread created which gets created when io_uring sees that the underlying device (/dev/md0 in this case) doesn't support nowait. After patch: $ ./t/io_uring /dev/md0 -p 0 -a 0 -d 1 -r 100 Running top while the above runs: $ ps -eL | grep $(pidof io_uring) 38341 38341 pts/2 00:10:22 io_uring 38341 38342 pts/2 00:10:37 io_uring After running this patch, we don't see any io worker thread being created which indicated that io_uring saw that the underlying device does support nowait. This is the exact behaviour noticed on a dm device which also supports nowait. For all the other raid personalities except raid0, we would need to train pieces which involves make_request fn in order for them to correctly handle REQ_NOWAIT. 
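Reduced to its essentials, the pattern the following hunks implement looks roughly like this; example_would_block() is an invented stand-in for any condition (suspended array, barrier, blocked rdev) that would otherwise force the driver to sleep:

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	static bool example_would_block(struct bio *bio);	/* illustrative stub */

	static void example_submit_bio(struct bio *bio)
	{
		if (example_would_block(bio)) {
			if (bio->bi_opf & REQ_NOWAIT) {
				/*
				 * End the bio with BLK_STS_AGAIN so the
				 * caller (e.g. io_uring) retries from a
				 * context that is allowed to sleep.
				 */
				bio_wouldblock_error(bio);
				return;
			}
			/* ... otherwise wait as before ... */
		}
		/* ... normal submission path ... */
	}

The capability is advertised with blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue) only when every member device supports nowait, as the md_run() hunk below shows.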
Reviewed-by: Jens Axboe Signed-off-by: Vishal Verma Signed-off-by: Song Liu --- drivers/md/md.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7fbf6f0ac01b..c6de7d4bfe4d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -418,6 +418,12 @@ check_suspended: rcu_read_lock(); if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); + /* Bail out if REQ_NOWAIT is set for the bio */ + if (bio->bi_opf & REQ_NOWAIT) { + rcu_read_unlock(); + bio_wouldblock_error(bio); + return; + } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); @@ -5787,6 +5793,7 @@ int md_run(struct mddev *mddev) int err; struct md_rdev *rdev; struct md_personality *pers; + bool nowait = true; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ @@ -5857,8 +5864,13 @@ int md_run(struct mddev *mddev) } } sysfs_notify_dirent_safe(rdev->sysfs_state); + nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev)); } + /* Set the NOWAIT flags if all underlying devices support it */ + if (nowait) + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); + if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) @@ -7002,6 +7014,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); + /* + * If the new disk does not support REQ_NOWAIT, + * disable on the whole MD. + */ + if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) { + pr_info("%s: Disabling nowait because %s does not support nowait\n", + mdname(mddev), bdevname(rdev->bdev, b)); + blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); + } /* * Kick recovery, maybe this spare has to be added to the * array immediately. From 5aa705039c4fca84575539bfa2b8a28454a3d2ca Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Tue, 21 Dec 2021 20:06:20 +0000 Subject: [PATCH 25/32] md: raid1 add nowait support This adds nowait support to the RAID1 driver. It makes RAID1 driver return with EAGAIN for situations where it could wait for eg: - Waiting for the barrier, wait_barrier() fn is modified to return bool to support error for wait barriers. It returns true in case of wait or if wait is not required and returns false if wait was required but not performed to support nowait. Reviewed-by: Jens Axboe Signed-off-by: Vishal Verma Signed-off-by: Song Liu --- drivers/md/raid1.c | 76 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index eeaedd6e0ce1..02db187090fa 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -929,8 +929,10 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr) wake_up(&conf->wait_barrier); } -static void _wait_barrier(struct r1conf *conf, int idx) +static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait) { + bool ret = true; + /* * We need to increase conf->nr_pending[idx] very early here, * then raise_barrier() can be blocked when it waits for @@ -961,7 +963,7 @@ static void _wait_barrier(struct r1conf *conf, int idx) */ if (!READ_ONCE(conf->array_frozen) && !atomic_read(&conf->barrier[idx])) - return; + return ret; /* * After holding conf->resync_lock, conf->nr_pending[idx] @@ -979,18 +981,27 @@ static void _wait_barrier(struct r1conf *conf, int idx) */ wake_up(&conf->wait_barrier); /* Wait for the barrier in same barrier unit bucket to drop. 
*/ - wait_event_lock_irq(conf->wait_barrier, - !conf->array_frozen && - !atomic_read(&conf->barrier[idx]), - conf->resync_lock); - atomic_inc(&conf->nr_pending[idx]); + + /* Return false when nowait flag is set */ + if (nowait) { + ret = false; + } else { + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + !atomic_read(&conf->barrier[idx]), + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + } + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); + return ret; } -static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) +static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) { int idx = sector_to_idx(sector_nr); + bool ret = true; /* * Very similar to _wait_barrier(). The difference is, for read @@ -1002,7 +1013,7 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) atomic_inc(&conf->nr_pending[idx]); if (!READ_ONCE(conf->array_frozen)) - return; + return ret; spin_lock_irq(&conf->resync_lock); atomic_inc(&conf->nr_waiting[idx]); @@ -1013,19 +1024,28 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) */ wake_up(&conf->wait_barrier); /* Wait for array to be unfrozen */ - wait_event_lock_irq(conf->wait_barrier, - !conf->array_frozen, - conf->resync_lock); - atomic_inc(&conf->nr_pending[idx]); + + /* Return false when nowait flag is set */ + if (nowait) { + /* Return false when nowait flag is set */ + ret = false; + } else { + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + } + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); + return ret; } -static void wait_barrier(struct r1conf *conf, sector_t sector_nr) +static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) { int idx = sector_to_idx(sector_nr); - _wait_barrier(conf, idx); + return _wait_barrier(conf, idx, nowait); } static void _allow_barrier(struct r1conf *conf, int idx) @@ -1236,7 +1256,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * Still need barrier for READ in case that whole * array is frozen. */ - wait_read_barrier(conf, bio->bi_iter.bi_sector); + if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, + bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return; + } if (!r1_bio) r1_bio = alloc_r1bio(mddev, bio); @@ -1336,6 +1360,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bio->bi_iter.bi_sector, bio_end_sector(bio))) { DEFINE_WAIT(w); + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return; + } for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); @@ -1353,7 +1381,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. 
*/ - wait_barrier(conf, bio->bi_iter.bi_sector); + if (!wait_barrier(conf, bio->bi_iter.bi_sector, + bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return; + } r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; @@ -1452,9 +1484,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; allow_barrier(conf, bio->bi_iter.bi_sector); + + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return; + } raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf, bio->bi_iter.bi_sector); + wait_barrier(conf, bio->bi_iter.bi_sector, false); goto retry_write; } @@ -1681,7 +1718,7 @@ static void close_sync(struct r1conf *conf) int idx; for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { - _wait_barrier(conf, idx); + _wait_barrier(conf, idx, false); _allow_barrier(conf, idx); } @@ -3403,4 +3440,3 @@ MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ MODULE_ALIAS("md-raid1"); MODULE_ALIAS("md-level-1"); - From c9aa889b035fca4598ae985a0f0c76ebbb547ad2 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Tue, 21 Dec 2021 20:06:21 +0000 Subject: [PATCH 26/32] md: raid10 add nowait support This adds nowait support to the RAID10 driver, very similar to the raid1 changes. It makes the RAID10 driver return EAGAIN in situations where it would otherwise wait, e.g.: - Waiting for the barrier, - Reshape operation, - Discard operation. wait_barrier() and regular_request_wait() are modified to return bool so that a skipped wait can be reported: they return true if the wait completed (or no wait was required), and false if a wait was required but not performed because nowait was requested. Reviewed-by: Jens Axboe Signed-off-by: Vishal Verma Signed-off-by: Song Liu --- drivers/md/raid10.c | 100 +++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c683ba138b58..2b969f70a31f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -952,8 +952,10 @@ static void lower_barrier(struct r10conf *conf) wake_up(&conf->wait_barrier); } -static void wait_barrier(struct r10conf *conf) +static bool wait_barrier(struct r10conf *conf, bool nowait) { + bool ret = true; + spin_lock_irq(&conf->resync_lock); if (conf->barrier) { struct bio_list *bio_list = current->bio_list; @@ -967,27 +969,35 @@ static void wait_barrier(struct r10conf *conf) * that queue to get the nr_pending * count down.
*/ - raid10_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (atomic_read(&conf->nr_pending) && - bio_list && - (!bio_list_empty(&bio_list[0]) || - !bio_list_empty(&bio_list[1]))) || - /* move on if recovery thread is - * blocked by us - */ - (conf->mddev->thread->tsk == current && - test_bit(MD_RECOVERY_RUNNING, - &conf->mddev->recovery) && - conf->nr_queued > 0), - conf->resync_lock); + /* Return false when nowait flag is set */ + if (nowait) { + ret = false; + } else { + raid10_log(conf->mddev, "wait barrier"); + wait_event_lock_irq(conf->wait_barrier, + !conf->barrier || + (atomic_read(&conf->nr_pending) && + bio_list && + (!bio_list_empty(&bio_list[0]) || + !bio_list_empty(&bio_list[1]))) || + /* move on if recovery thread is + * blocked by us + */ + (conf->mddev->thread->tsk == current && + test_bit(MD_RECOVERY_RUNNING, + &conf->mddev->recovery) && + conf->nr_queued > 0), + conf->resync_lock); + } conf->nr_waiting--; if (!conf->nr_waiting) wake_up(&conf->wait_barrier); } - atomic_inc(&conf->nr_pending); + /* Only increment nr_pending when we wait */ + if (ret) + atomic_inc(&conf->nr_pending); spin_unlock_irq(&conf->resync_lock); + return ret; } static void allow_barrier(struct r10conf *conf) @@ -1098,21 +1108,30 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) * currently. * 2. If IO spans the reshape position. Need to wait for reshape to pass. */ -static void regular_request_wait(struct mddev *mddev, struct r10conf *conf, +static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, struct bio *bio, sector_t sectors) { - wait_barrier(conf); + /* Bail out if REQ_NOWAIT is set for the bio */ + if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return false; + } while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { - raid10_log(conf->mddev, "wait reshape"); allow_barrier(conf); + if (bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return false; + } + raid10_log(conf->mddev, "wait reshape"); wait_event(conf->wait_barrier, conf->reshape_progress <= bio->bi_iter.bi_sector || conf->reshape_progress >= bio->bi_iter.bi_sector + sectors); - wait_barrier(conf); + wait_barrier(conf, false); } + return true; } static void raid10_read_request(struct mddev *mddev, struct bio *bio, @@ -1157,7 +1176,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rcu_read_unlock(); } - regular_request_wait(mddev, conf, bio, r10_bio->sectors); + if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) + return; rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { if (err_rdev) { @@ -1179,7 +1199,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, bio_chain(split, bio); allow_barrier(conf); submit_bio_noacct(bio); - wait_barrier(conf); + wait_barrier(conf, false); bio = split; r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; @@ -1338,7 +1358,7 @@ retry_wait: raid10_log(conf->mddev, "%s wait rdev %d blocked", __func__, blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); + wait_barrier(conf, false); goto retry_wait; } } @@ -1356,6 +1376,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, bio->bi_iter.bi_sector, bio_end_sector(bio)))) { DEFINE_WAIT(w); + /* Bail out if REQ_NOWAIT is set for the bio */ + if (bio->bi_opf & REQ_NOWAIT) { + 
bio_wouldblock_error(bio); + return; + } for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); @@ -1368,7 +1393,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, } sectors = r10_bio->sectors; - regular_request_wait(mddev, conf, bio, sectors); + if (!regular_request_wait(mddev, conf, bio, sectors)) + return; if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && (mddev->reshape_backwards ? (bio->bi_iter.bi_sector < conf->reshape_safe && @@ -1380,6 +1406,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); + if (bio->bi_opf & REQ_NOWAIT) { + allow_barrier(conf); + bio_wouldblock_error(bio); + return; + } raid10_log(conf->mddev, "wait reshape metadata"); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); @@ -1476,7 +1507,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, bio_chain(split, bio); allow_barrier(conf); submit_bio_noacct(bio); - wait_barrier(conf); + wait_barrier(conf, false); bio = split; r10_bio->master_bio = bio; } @@ -1601,7 +1632,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return -EAGAIN; - wait_barrier(conf); + if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { + bio_wouldblock_error(bio); + return 0; + } + wait_barrier(conf, false); /* * Check reshape again to avoid reshape happens after checking @@ -1643,7 +1678,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) allow_barrier(conf); /* Resend the fist split part */ submit_bio_noacct(split); - wait_barrier(conf); + wait_barrier(conf, false); } div_u64_rem(bio_end, stripe_size, &remainder); if (remainder) { @@ -1654,7 +1689,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) /* Resend the second split part */ submit_bio_noacct(bio); bio = split; - wait_barrier(conf); + wait_barrier(conf, false); } bio_start = bio->bi_iter.bi_sector; @@ -1810,7 +1845,7 @@ retry_discard: end_disk_offset += geo->stride; atomic_inc(&first_r10bio->remaining); raid_end_discard_bio(r10_bio); - wait_barrier(conf); + wait_barrier(conf, false); goto retry_discard; } @@ -2005,7 +2040,7 @@ static void print_conf(struct r10conf *conf) static void close_sync(struct r10conf *conf) { - wait_barrier(conf); + wait_barrier(conf, false); allow_barrier(conf); mempool_exit(&conf->r10buf_pool); @@ -4813,7 +4848,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, if (need_flush || time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Need to update reshape_position in metadata */ - wait_barrier(conf); + wait_barrier(conf, false); mddev->reshape_position = conf->reshape_progress; if (mddev->reshape_backwards) mddev->curr_resync_completed = raid10_size(mddev, 0, 0) @@ -5236,4 +5271,3 @@ MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ MODULE_ALIAS("md-raid10"); MODULE_ALIAS("md-level-10"); - From bf2c411bb1cfc45f73eb6c55b5755bcb990063ae Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Tue, 21 Dec 2021 20:06:22 +0000 Subject: [PATCH 27/32] md: raid456 add nowait support Returns EAGAIN in case the raid456 driver would block waiting for reshape. 
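These EAGAIN completions put the burden of retrying on the submitter: a nowait caller is expected to fall back to a context that may sleep, which is what io_uring's io-wq punt does internally. A hedged userspace sketch of that contract, with illustrative names (not from the patch):

#define _GNU_SOURCE
#include <errno.h>
#include <sys/types.h>
#include <sys/uio.h>

/*
 * Try the fast nowait path first; if the driver refuses to wait
 * (e.g. raid456 returning EAGAIN while a reshape is in flight),
 * retry the same read on the blocking path.
 */
static ssize_t read_nowait_with_fallback(int fd, struct iovec *iov,
					 int iovcnt, off_t off)
{
	ssize_t ret = preadv2(fd, iov, iovcnt, off, RWF_NOWAIT);

	if (ret < 0 && errno == EAGAIN)
		ret = preadv2(fd, iov, iovcnt, off, 0);	/* may block */
	return ret;
}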
Reviewed-by: Jens Axboe Signed-off-by: Vishal Verma Signed-off-by: Song Liu --- drivers/md/raid5.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1240a5c16af8..beb544be9058 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5686,6 +5686,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) struct stripe_head *sh; int stripe_sectors; + /* We need to handle this when io_uring supports discard/trim */ + if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT)) + return; + if (mddev->reshape_position != MaxSector) /* Skip discard while reshape is happening */ return; @@ -5819,6 +5823,17 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL; + /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ + if ((bi->bi_opf & REQ_NOWAIT) && + (conf->reshape_progress != MaxSector) && + (mddev->reshape_backwards + ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe) + : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) { + bio_wouldblock_error(bi); + if (rw == WRITE) + md_write_end(mddev); + return true; + } md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { From dd3dc5f416b7247a4b5d7bac6698be623c180572 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Dec 2021 18:24:11 -0800 Subject: [PATCH 28/32] md: fix spelling of "its" Use the possessive "its" instead of the contraction "it's" in printed messages. Signed-off-by: Randy Dunlap Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 2 +- drivers/md/md.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 7fbd41e156c9..1c8a06b77c85 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -574,7 +574,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) int ret = 0; if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), - "node %d received it's own msg\n", le32_to_cpu(msg->slot))) + "node %d received its own msg\n", le32_to_cpu(msg->slot))) return -1; switch (le32_to_cpu(msg->type)) { case METADATA_UPDATED: diff --git a/drivers/md/md.c b/drivers/md/md.c index c6de7d4bfe4d..3f75c5896b92 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8421,7 +8421,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) spin_lock(&pers_lock); /* ensure module won't be unloaded */ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { - pr_warn("can't find md-cluster module or get it's reference.\n"); + pr_warn("can't find md-cluster module or get its reference.\n"); spin_unlock(&pers_lock); return -ENOENT; } From 38640c480939d56cc8b03d58642fc5261761a697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dirk=20M=C3=BCller?= Date: Wed, 5 Jan 2022 17:38:46 +0100 Subject: [PATCH 29/32] lib/raid6: skip benchmark of non-chosen xor_syndrome functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit fe5cbc6e06c7 ("md/raid6 algorithms: delta syndrome functions") an xor_syndrome() benchmark was also added to the raid6_choose_gen() function. However, the results of that benchmark were intentionally discarded and did not influence the choice.
It picked the xor_syndrome() variant related to the best performing gen_syndrome(). Reduce runtime of raid6_choose_gen() without modifying its outcome by only benchmarking the xor_syndrome() of the best gen_syndrome() variant. For a HZ=250 x86_64 system with avx2 and without avx512 this removes 5 out of 6 xor() benchmarks, saving 340ms of raid6 initialization time. Signed-off-by: Dirk Müller Signed-off-by: Song Liu --- lib/raid6/algos.c | 82 +++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 6d5e5000fdd7..9b7e8a837b27 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -145,12 +145,12 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) static inline const struct raid6_calls *raid6_choose_gen( void *(*const dptrs)[RAID6_TEST_DISKS], const int disks) { - unsigned long perf, bestgenperf, bestxorperf, j0, j1; + unsigned long perf, bestgenperf, j0, j1; int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ const struct raid6_calls *const *algo; const struct raid6_calls *best; - for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { + for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { if (!best || (*algo)->prefer >= best->prefer) { if ((*algo)->valid && !(*algo)->valid()) continue; @@ -180,50 +180,48 @@ static inline const struct raid6_calls *raid6_choose_gen( pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, (perf * HZ * (disks-2)) >> (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2)); - - if (!(*algo)->xor_syndrome) - continue; - - perf = 0; - - preempt_disable(); - j0 = jiffies; - while ((j1 = jiffies) == j0) - cpu_relax(); - while (time_before(jiffies, - j1 + (1<xor_syndrome(disks, start, stop, - PAGE_SIZE, *dptrs); - perf++; - } - preempt_enable(); - - if (best == *algo) - bestxorperf = perf; - - pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name, - (perf * HZ * (disks-2)) >> - (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1)); } } - if (best) { - if (IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { - pr_info("raid6: using algorithm %s gen() %ld MB/s\n", - best->name, - (bestgenperf * HZ * (disks-2)) >> - (20 - PAGE_SHIFT+RAID6_TIME_JIFFIES_LG2)); - if (best->xor_syndrome) - pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", - (bestxorperf * HZ * (disks-2)) >> - (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1)); - } else - pr_info("raid6: skip pq benchmark and using algorithm %s\n", - best->name); - raid6_call = *best; - } else - pr_err("raid6: Yikes! No algorithm found!\n"); + if (!best) { + pr_err("raid6: Yikes! No algorithm found!\n"); + goto out; + } + raid6_call = *best; + + if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { + pr_info("raid6: skipped pq benchmark and selected %s\n", + best->name); + goto out; + } + + pr_info("raid6: using algorithm %s gen() %ld MB/s\n", + best->name, + (bestgenperf * HZ * (disks - 2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2)); + + if (best->xor_syndrome) { + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1 << RAID6_TIME_JIFFIES_LG2))) { + best->xor_syndrome(disks, start, stop, + PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + pr_info("raid6: .... 
xor() %ld MB/s, rmw enabled\n", + (perf * HZ * (disks - 2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1)); + } + +out: return best; } From 36dacddbf0bdba86cd00f066b4d724157eeb63f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dirk=20M=C3=BCller?= Date: Wed, 5 Jan 2022 17:38:47 +0100 Subject: [PATCH 30/32] lib/raid6: Use strict priority ranking for pq gen() benchmarking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On x86_64, currently 3 variants of AVX512, 3 variants of AVX2 and 3 variants of SSE2 are benchmarked on initialization, taking between 144 and 153 jiffies. Testing across a hardware pool of various generations of Intel CPUs, I could not find a single case where SSE2 won over AVX2 or AVX512. There are, however, cases where AVX2 wins over AVX512. Change "prefer" into an integer priority field (similar to how recov selection works) to have more than one ranking level available, which is backwards compatible with existing behavior. Give AVX2/512 variants higher priority over SSE2 in order to skip SSE testing when AVX is available. In an AVX2/x86_64/HZ=250 case this saves on the order of 200ms of initialization time. Signed-off-by: Dirk Müller Acked-by: Paul Menzel Signed-off-by: Song Liu --- include/linux/raid/pq.h | 2 +- lib/raid6/algos.c | 2 +- lib/raid6/avx2.c | 8 ++++---- lib/raid6/avx512.c | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 154e954b711d..d6e5a1feb947 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -81,7 +81,7 @@ struct raid6_calls { void (*xor_syndrome)(int, int, int, size_t, void **); int (*valid)(void); /* Returns 1 if this routine set is usable */ const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ + int priority; /* Relative priority ranking if non-zero */ }; /* Selected algorithm */ diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 9b7e8a837b27..39b74221f4a7 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -151,7 +151,7 @@ static inline const struct raid6_calls *raid6_choose_gen( const struct raid6_calls *best; for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { - if (!best || (*algo)->prefer >= best->prefer) { + if (!best || (*algo)->priority >= best->priority) { if ((*algo)->valid && !(*algo)->valid()) continue; diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index f299476e1d76..059024234dce 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c @@ -132,7 +132,7 @@ const struct raid6_calls raid6_avx2x1 = { raid6_avx21_xor_syndrome, raid6_have_avx2, "avx2x1", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ }; /* @@ -262,7 +262,7 @@ const struct raid6_calls raid6_avx2x2 = { raid6_avx22_xor_syndrome, raid6_have_avx2, "avx2x2", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ }; #ifdef CONFIG_X86_64 @@ -465,6 +465,6 @@ const struct raid6_calls raid6_avx2x4 = { raid6_avx24_xor_syndrome, raid6_have_avx2, "avx2x4", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ }; -#endif +#endif /* CONFIG_X86_64 */ diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c index bb684d144ee2..9c3e822e1adf 100644 --- a/lib/raid6/avx512.c +++ b/lib/raid6/avx512.c @@ -162,7 +162,7 @@ const struct raid6_calls raid6_avx512x1 = { raid6_avx5121_xor_syndrome, raid6_have_avx512, "avx512x1", - 1 /* Has cache hints */ + .priority = 2 /*
Prefer AVX512 over priority 1 (SSE2 and others) */ }; /* @@ -319,7 +319,7 @@ const struct raid6_calls raid6_avx512x2 = { raid6_avx5122_xor_syndrome, raid6_have_avx512, "avx512x2", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ }; #ifdef CONFIG_X86_64 @@ -557,7 +557,7 @@ const struct raid6_calls raid6_avx512x4 = { raid6_avx5124_xor_syndrome, raid6_have_avx512, "avx512x4", - 1 /* Has cache hints */ + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ }; #endif From 0c031fd37f69deb0cd8c43bbfcfccd62ebd7e952 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 10 Dec 2021 17:31:15 +0800 Subject: [PATCH 31/32] md: Move alloc/free acct bioset in to personality bioset acct is only needed for raid0 and raid5. Therefore, md_run only allocates it for raid0 and raid5. However, this does not cover personality takeover, which may cause uninitialized bioset. For example, the following repro steps: mdadm -CR /dev/md0 -l1 -n2 /dev/loop0 /dev/loop1 mdadm --wait /dev/md0 mkfs.xfs /dev/md0 mdadm /dev/md0 --grow -l5 mount /dev/md0 /mnt causes panic like: [ 225.933939] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 225.934903] #PF: supervisor instruction fetch in kernel mode [ 225.935639] #PF: error_code(0x0010) - not-present page [ 225.936361] PGD 0 P4D 0 [ 225.936677] Oops: 0010 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN PTI [ 225.937525] CPU: 27 PID: 1133 Comm: mount Not tainted 5.16.0-rc3+ #706 [ 225.938416] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.module_el8.4.0+547+a85d02ba 04/01/2014 [ 225.939922] RIP: 0010:0x0 [ 225.940289] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6. [ 225.941196] RSP: 0018:ffff88815897eff0 EFLAGS: 00010246 [ 225.941897] RAX: 0000000000000000 RBX: 0000000000092800 RCX: ffffffff81370a39 [ 225.942813] RDX: dffffc0000000000 RSI: 0000000000000000 RDI: 0000000000092800 [ 225.943772] RBP: 1ffff1102b12fe04 R08: fffffbfff0b43c01 R09: fffffbfff0b43c01 [ 225.944807] R10: ffffffff85a1e007 R11: fffffbfff0b43c00 R12: ffff88810eaaaf58 [ 225.945757] R13: 0000000000000000 R14: ffff88810eaaafb8 R15: ffff88815897f040 [ 225.946709] FS: 00007ff3f2505080(0000) GS:ffff888fb5e00000(0000) knlGS:0000000000000000 [ 225.947814] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 225.948556] CR2: ffffffffffffffd6 CR3: 000000015aa5a006 CR4: 0000000000370ee0 [ 225.949537] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 225.950455] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 225.951414] Call Trace: [ 225.951787] [ 225.952120] mempool_alloc+0xe5/0x250 [ 225.952625] ? mempool_resize+0x370/0x370 [ 225.953187] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 225.953862] ? rcu_read_lock_bh_held+0xb0/0xb0 [ 225.954464] ? sched_clock_cpu+0x15/0x120 [ 225.955019] ? find_held_lock+0xac/0xd0 [ 225.955564] bio_alloc_bioset+0x1ed/0x2a0 [ 225.956080] ? lock_downgrade+0x3a0/0x3a0 [ 225.956644] ? bvec_alloc+0xc0/0xc0 [ 225.957135] bio_clone_fast+0x19/0x80 [ 225.957651] raid5_make_request+0x1370/0x1b70 [ 225.958286] ? sched_clock_cpu+0x15/0x120 [ 225.958797] ? __lock_acquire+0x8b2/0x3510 [ 225.959339] ? raid5_get_active_stripe+0xce0/0xce0 [ 225.959986] ? lock_is_held_type+0xd8/0x130 [ 225.960528] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 225.961135] ? rcu_read_lock_bh_held+0xb0/0xb0 [ 225.961703] ? sched_clock_cpu+0x15/0x120 [ 225.962232] ? lock_release+0x27a/0x6c0 [ 225.962746] ? do_wait_intr_irq+0x130/0x130 [ 225.963302] ? lock_downgrade+0x3a0/0x3a0 [ 225.963815] ? 
lock_release+0x6c0/0x6c0 [ 225.964348] md_handle_request+0x342/0x530 [ 225.964888] ? set_in_sync+0x170/0x170 [ 225.965397] ? blk_queue_split+0x133/0x150 [ 225.965988] ? __blk_queue_split+0x8b0/0x8b0 [ 225.966524] ? submit_bio_checks+0x3b2/0x9d0 [ 225.967069] md_submit_bio+0x127/0x1c0 [...] Fix this by moving alloc/free of acct bioset to pers->run and pers->free. While we are on this, properly handle md_integrity_register() error in raid0_run(). Fixes: daee2024715d (md: check level before create and exit io_acct_set) Cc: stable@vger.kernel.org Acked-by: Guoqing Jiang Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 27 +++++++++++++++++---------- drivers/md/md.h | 2 ++ drivers/md/raid0.c | 38 ++++++++++++++++++++++++++++---------- drivers/md/raid5.c | 41 ++++++++++++++++++++++++++++++----------- 4 files changed, 77 insertions(+), 31 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3f75c5896b92..3f57843918ec 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5881,13 +5881,6 @@ int md_run(struct mddev *mddev) if (err) goto exit_bio_set; } - if (mddev->level != 1 && mddev->level != 10 && - !bioset_initialized(&mddev->io_acct_set)) { - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, - offsetof(struct md_io_acct, bio_clone), 0); - if (err) - goto exit_sync_set; - } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -6064,9 +6057,6 @@ bitmap_abort: module_put(pers->owner); md_bitmap_destroy(mddev); abort: - if (mddev->level != 1 && mddev->level != 10) - bioset_exit(&mddev->io_acct_set); -exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); @@ -8608,6 +8598,23 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); +int acct_bioset_init(struct mddev *mddev) +{ + int err = 0; + + if (!bioset_initialized(&mddev->io_acct_set)) + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, + offsetof(struct md_io_acct, bio_clone), 0); + return err; +} +EXPORT_SYMBOL_GPL(acct_bioset_init); + +void acct_bioset_exit(struct mddev *mddev) +{ + bioset_exit(&mddev->io_acct_set); +} +EXPORT_SYMBOL_GPL(acct_bioset_exit); + static void md_end_io_acct(struct bio *bio) { struct md_io_acct *md_io_acct = bio->bi_private; diff --git a/drivers/md/md.h b/drivers/md/md.h index 53ea7a6961de..f1bf3625ef4c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -721,6 +721,8 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); +int acct_bioset_init(struct mddev *mddev); +void acct_bioset_exit(struct mddev *mddev); void md_account_bio(struct mddev *mddev, struct bio **bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 62c8b6adac70..b59a77b31b90 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -356,7 +356,21 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static void raid0_free(struct mddev *mddev, void *priv); +static void free_conf(struct mddev *mddev, struct r0conf *conf) +{ + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); + mddev->private = NULL; +} + +static void raid0_free(struct mddev *mddev, void *priv) +{ + struct r0conf *conf = priv; + + free_conf(mddev, conf); + acct_bioset_exit(mddev); +} static 
int raid0_run(struct mddev *mddev) { @@ -370,11 +384,16 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; + if (acct_bioset_init(mddev)) { + pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev)); + return -ENOMEM; + } + /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); if (ret < 0) - return ret; + goto exit_acct_set; mddev->private = conf; } conf = mddev->private; @@ -413,17 +432,16 @@ static int raid0_run(struct mddev *mddev) dump_zones(mddev); ret = md_integrity_register(mddev); + if (ret) + goto free; return ret; -} -static void raid0_free(struct mddev *mddev, void *priv) -{ - struct r0conf *conf = priv; - - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); +free: + free_conf(mddev, conf); +exit_acct_set: + acct_bioset_exit(mddev); + return ret; } static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index beb544be9058..ffe720c73b0a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7462,12 +7462,19 @@ static int raid5_run(struct mddev *mddev) struct md_rdev *rdev; struct md_rdev *journal_dev = NULL; sector_t reshape_offset = 0; - int i; + int i, ret = 0; long long min_offset_diff = 0; int first = 1; - if (mddev_init_writes_pending(mddev) < 0) + if (acct_bioset_init(mddev)) { + pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev)); return -ENOMEM; + } + + if (mddev_init_writes_pending(mddev) < 0) { + ret = -ENOMEM; + goto exit_acct_set; + } if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", @@ -7498,7 +7505,8 @@ static int raid5_run(struct mddev *mddev) (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", mdname(mddev)); - return -EINVAL; + ret = -EINVAL; + goto exit_acct_set; } if (mddev->reshape_position != MaxSector) { @@ -7523,13 +7531,15 @@ static int raid5_run(struct mddev *mddev) if (journal_dev) { pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", mdname(mddev)); - return -EINVAL; + ret = -EINVAL; + goto exit_acct_set; } if (mddev->new_level != mddev->level) { pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", mdname(mddev)); - return -EINVAL; + ret = -EINVAL; + goto exit_acct_set; } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one @@ -7545,7 +7555,8 @@ static int raid5_run(struct mddev *mddev) if (sector_div(here_new, chunk_sectors * new_data_disks)) { pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", mdname(mddev)); - return -EINVAL; + ret = -EINVAL; + goto exit_acct_set; } reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ @@ -7567,7 +7578,8 @@ static int raid5_run(struct mddev *mddev) else if (mddev->ro == 0) { pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", mdname(mddev)); - return -EINVAL; + ret = -EINVAL; + goto exit_acct_set; } } else if (mddev->reshape_backwards ? 
(here_new * chunk_sectors + min_offset_diff <= @@ -7577,7 +7589,8 @@ static int raid5_run(struct mddev *mddev) /* Reading from the same stripe as writing to - bad */ pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", mdname(mddev)); - return -EINVAL; + ret = -EINVAL; + goto exit_acct_set; } pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); /* OK, we should be able to continue; */ @@ -7601,8 +7614,10 @@ static int raid5_run(struct mddev *mddev) else conf = mddev->private; - if (IS_ERR(conf)) - return PTR_ERR(conf); + if (IS_ERR(conf)) { + ret = PTR_ERR(conf); + goto exit_acct_set; + } if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { if (!journal_dev) { @@ -7799,7 +7814,10 @@ abort: free_conf(conf); mddev->private = NULL; pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); - return -EIO; + ret = -EIO; +exit_acct_set: + acct_bioset_exit(mddev); + return ret; } static void raid5_free(struct mddev *mddev, void *priv) @@ -7807,6 +7825,7 @@ static void raid5_free(struct mddev *mddev, void *priv) struct r5conf *conf = priv; free_conf(conf); + acct_bioset_exit(mddev); mddev->to_remove = &raid5_attrs_group; } From 1745e857e73a2e29379013438ee271e9aadab2e0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 6 Jan 2022 11:03:35 +0100 Subject: [PATCH 32/32] md: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the md rdev sysfs code to use default_groups field which has been the preferred way since commit aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Song Liu --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3f57843918ec..854cbf4234aa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3608,6 +3608,7 @@ static struct attribute *rdev_default_attrs[] = { &rdev_ppl_size.attr, NULL, }; +ATTRIBUTE_GROUPS(rdev_default); static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -3657,7 +3658,7 @@ static const struct sysfs_ops rdev_sysfs_ops = { static struct kobj_type rdev_ktype = { .release = rdev_free, .sysfs_ops = &rdev_sysfs_ops, - .default_attrs = rdev_default_attrs, + .default_groups = rdev_default_groups, }; int md_rdev_init(struct md_rdev *rdev)
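The conversion pattern in this last patch generalizes to any kobj_type: keep the NULL-terminated attribute array, wrap it with the ATTRIBUTE_GROUPS() helper (which generates the matching *_groups array), and point .default_groups at the result. A minimal sketch with hypothetical foo_* names (the attributes, release hook, and sysfs_ops are assumptions, assumed to be defined elsewhere in the driver):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct attribute *foo_default_attrs[] = {
	&foo_bar_attr.attr,
	&foo_baz_attr.attr,
	NULL,	/* the array must stay NULL-terminated */
};
/* expands to foo_default_group and foo_default_groups[] */
ATTRIBUTE_GROUPS(foo_default);

static struct kobj_type foo_ktype = {
	.release	= foo_release,
	.sysfs_ops	= &foo_sysfs_ops,
	/* .default_attrs = foo_default_attrs,	(obsolete field) */
	.default_groups	= foo_default_groups,
};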