From a1ca8295ee53a2fc57085fae26df37228c655791 Mon Sep 17 00:00:00 2001 From: Wang chaodong Date: Fri, 20 Oct 2023 16:51:06 +0800 Subject: [PATCH 01/12] PM: hibernate: Drop unnecessary local variable initialization It is not necessary to intialize the error variable in create_basic_memory_bitmaps(), because it is only read after being assigned a value. Signed-off-by: Wang chaodong [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 50a15408c3fc..71b2f12ed3b5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1119,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - int error = 0; + int error; if (forbidden_pages_map && free_pages_map) return 0; From bbeaa4691fa8682e2fe2e87f28d5fce39805fa68 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Fri, 27 Oct 2023 09:55:33 +0800 Subject: [PATCH 02/12] PM: hibernate: Do not initialize error in swap_write_page() 'error' first receives the function result before it is used, and it does not need to be assigned a value during definition. Signed-off-by: Li zeming [ rjw: Subject rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a2cb0babb5ec..68973ca2cf07 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -451,7 +451,7 @@ err_close: static int swap_write_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { - int error = 0; + int error; sector_t offset; if (!handle->cur) From 4ac934b1aaa99e00ca25875d55094a4fe34e212d Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 24 Oct 2023 10:04:34 +0800 Subject: [PATCH 03/12] PM: hibernate: Do not initialize error in snapshot_write_next() The error variable in snapshot_write_next() gets a value before it is used, so don't initialize it to 0 upfront. Signed-off-by: Li zeming [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 71b2f12ed3b5..e3e8f1c6e75f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2778,7 +2778,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; - int error = 0; + int error; next: /* Check if we have already loaded the entire image */ From 0c4cae1bc00d31c78858c184ede351baea232bdb Mon Sep 17 00:00:00 2001 From: Chris Feng Date: Wed, 13 Dec 2023 16:32:51 +0800 Subject: [PATCH 04/12] PM: hibernate: Avoid missing wakeup events during hibernation Wakeup events that occur in the hibernation process's hibernation_platform_enter() cannot wake up the system. Although the current hibernation framework will execute part of the recovery process after a wakeup event occurs, it ultimately performs a shutdown operation because the system does not check the return value of hibernation_platform_enter(). In short, if a wakeup event occurs before putting the system into the final low-power state, it will be missed. To solve this problem, check the return value of hibernation_platform_enter(). When it returns -EAGAIN or -EBUSY (indicate the occurrence of a wakeup event), execute the hibernation recovery process, discard the previously saved image, and ultimately return to the working state. Signed-off-by: Chris Feng [ rjw: Rephrase the message printed when going back to the working state ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 10 ++++++++-- kernel/power/power.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index dee341ae4ace..4b0b7cf2e019 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -642,9 +642,9 @@ int hibernation_platform_enter(void) */ static void power_down(void) { -#ifdef CONFIG_SUSPEND int error; +#ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { error = suspend_devices_and_enter(mem_sleep_current); if (error) { @@ -667,7 +667,13 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - hibernation_platform_enter(); + error = hibernation_platform_enter(); + if (error == -EAGAIN || error == -EBUSY) { + swsusp_unmark(); + events_check_enabled = false; + pr_info("Wakeup event detected during hibernation, rolling back.\n"); + return; + } fallthrough; case HIBERNATION_SHUTDOWN: if (kernel_can_power_off()) diff --git a/kernel/power/power.h b/kernel/power/power.h index 17fd9aaaf084..8499a39c62f4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -175,6 +175,8 @@ extern int swsusp_write(unsigned int flags); void swsusp_close(void); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); +#else +static inline int swsusp_unmark(void) { return 0; } #endif struct __kernel_old_timeval; From 71cd7e80cfde548959952eac7063aeaea1f2e1c6 Mon Sep 17 00:00:00 2001 From: Hongchen Zhang Date: Thu, 16 Nov 2023 08:56:09 +0800 Subject: [PATCH 05/12] PM: hibernate: Enforce ordering during image compression/decompression An S4 (suspend to disk) test on the LoongArch 3A6000 platform sometimes fails with the following error messaged in the dmesg log: Invalid LZO compressed length That happens because when compressing/decompressing the image, the synchronization between the control thread and the compress/decompress/crc thread is based on a relaxed ordering interface, which is unreliable, and the following situation may occur: CPU 0 CPU 1 save_image_lzo lzo_compress_threadfn atomic_set(&d->stop, 1); atomic_read(&data[thr].stop) data[thr].cmp = data[thr].cmp_len; WRITE data[thr].cmp_len Then CPU0 gets a stale cmp_len and writes it to disk. During resume from S4, wrong cmp_len is loaded. To maintain data consistency between the two threads, use the acquire/release variants of atomic set and read operations. Fixes: 081a9d043c98 ("PM / Hibernate: Improve performance of LZO/plain hibernation, checksum image") Cc: All applicable Signed-off-by: Hongchen Zhang Co-developed-by: Weihao Li Signed-off-by: Weihao Li [ rjw: Subject rewrite and changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 68973ca2cf07..975e7195573b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -606,11 +606,11 @@ static int crc32_threadfn(void *data) unsigned i; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -619,7 +619,7 @@ static int crc32_threadfn(void *data) for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -649,12 +649,12 @@ static int lzo_compress_threadfn(void *data) struct cmp_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -663,7 +663,7 @@ static int lzo_compress_threadfn(void *data) d->ret = lzo1x_1_compress(d->unc, d->unc_len, d->cmp + LZO_HEADER, &d->cmp_len, d->wrk); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -798,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle, data[thr].unc_len = off; - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -806,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle, break; crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; @@ -850,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } } - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } @@ -1132,12 +1132,12 @@ static int lzo_decompress_threadfn(void *data) struct dec_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -1150,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -1335,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); crc->run_threads = 0; } @@ -1371,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle, pg = 0; } - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -1390,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; @@ -1421,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle, ret = snapshot_write_next(snapshot); if (ret <= 0) { crc->run_threads = thr + 1; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); goto out_finish; } @@ -1429,13 +1429,13 @@ static int load_image_lzo(struct swap_map_handle *handle, } crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); } out_finish: if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } stop = ktime_get(); From 489c693bd04a2308865dc50f37bd0b5f6ad52deb Mon Sep 17 00:00:00 2001 From: Chen Haonan Date: Tue, 19 Dec 2023 21:06:25 +0800 Subject: [PATCH 06/12] PM: hibernate: Use kmap_local_page() in copy_data_page() kmap_atomic() has been deprecated in favor of kmap_local_page(). kmap_atomic() disables page-faults and preemption (the latter only for !PREEMPT_RT kernels).The code between the mapping and un-mapping in this patch does not depend on the above-mentioned side effects.So simply replaced kmap_atomic() with kmap_local_page(). Signed-off-by: Chen Haonan [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index e3e8f1c6e75f..5c96ff067c64 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1487,11 +1487,11 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page); - dst = kmap_atomic(d_page); + src = kmap_local_page(s_page); + dst = kmap_local_page(d_page); zeros_only = do_copy_page(dst, src); - kunmap_atomic(dst); - kunmap_atomic(src); + kunmap_local(dst); + kunmap_local(src); } else { if (PageHighMem(d_page)) { /* @@ -1499,9 +1499,9 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ zeros_only = safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page); + dst = kmap_local_page(d_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); } else { zeros_only = safe_copy_page(page_address(d_page), s_page); } From 4bbf0b6a64455c95586caf130e374586caef9986 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 12 Dec 2023 22:00:43 +0800 Subject: [PATCH 07/12] Documentation: PM: Adjust freezing-of-tasks.rst to the freezer changes The core freezer logic has been modified by commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"), so adjust the documentation to reflect the new code. The main changes include: - Drop references to PF_FROZEN and PF_FREEZER_SKIP - Describe TASK_FROZEN, TASK_FREEZABLE and __TASK_FREEZABLE_UNSAFE - Replace system_freezing_cnt with freezer_active - Use a different example for the loop of a freezable kernel thread, since the old code is gone gone Signed-off-by: Kevin Hao [ rjw: Subject and changelog edits, doc text adjustments ] Signed-off-by: Rafael J. Wysocki --- Documentation/power/freezing-of-tasks.rst | 81 +++++++++++++---------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst index 53b6a56c4635..df9755bfbd94 100644 --- a/Documentation/power/freezing-of-tasks.rst +++ b/Documentation/power/freezing-of-tasks.rst @@ -14,27 +14,28 @@ architectures). II. How does it work? ===================== -There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN -and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have -PF_NOFREEZE unset (all user space processes and some kernel threads) are -regarded as 'freezable' and treated in a special way before the system enters a -suspend state as well as before a hibernation image is created (in what follows -we only consider hibernation, but the description also applies to suspend). +There is one per-task flag (PF_NOFREEZE) and three per-task states +(TASK_FROZEN, TASK_FREEZABLE and __TASK_FREEZABLE_UNSAFE) used for that. +The tasks that have PF_NOFREEZE unset (all user space tasks and some kernel +threads) are regarded as 'freezable' and treated in a special way before the +system enters a sleep state as well as before a hibernation image is created +(hibernation is directly covered by what follows, but the description applies +to system-wide suspend too). Namely, as the first step of the hibernation procedure the function freeze_processes() (defined in kernel/power/process.c) is called. A system-wide -variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate -whether the system is to undergo a freezing operation. And freeze_processes() -sets this variable. After this, it executes try_to_freeze_tasks() that sends a -fake signal to all user space processes, and wakes up all the kernel threads. -All freezable tasks must react to that by calling try_to_freeze(), which -results in a call to __refrigerator() (defined in kernel/freezer.c), which sets -the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes -it loop until PF_FROZEN is cleared for it. Then, we say that the task is -'frozen' and therefore the set of functions handling this mechanism is referred -to as 'the freezer' (these functions are defined in kernel/power/process.c, -kernel/freezer.c & include/linux/freezer.h). User space processes are generally -frozen before kernel threads. +static key freezer_active (as opposed to a per-task flag or state) is used to +indicate whether the system is to undergo a freezing operation. And +freeze_processes() sets this static key. After this, it executes +try_to_freeze_tasks() that sends a fake signal to all user space processes, and +wakes up all the kernel threads. All freezable tasks must react to that by +calling try_to_freeze(), which results in a call to __refrigerator() (defined +in kernel/freezer.c), which changes the task's state to TASK_FROZEN, and makes +it loop until it is woken by an explicit TASK_FROZEN wakeup. Then, that task +is regarded as 'frozen' and so the set of functions handling this mechanism is +referred to as 'the freezer' (these functions are defined in +kernel/power/process.c, kernel/freezer.c & include/linux/freezer.h). User space +tasks are generally frozen before kernel threads. __refrigerator() must not be called directly. Instead, use the try_to_freeze() function (defined in include/linux/freezer.h), that checks @@ -43,31 +44,40 @@ if the task is to be frozen and makes the task enter __refrigerator(). For user space processes try_to_freeze() is called automatically from the signal-handling code, but the freezable kernel threads need to call it explicitly in suitable places or use the wait_event_freezable() or -wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) -that combine interruptible sleep with checking if the task is to be frozen and -calling try_to_freeze(). The main loop of a freezable kernel thread may look +wait_event_freezable_timeout() macros (defined in include/linux/wait.h) +that put the task to sleep (TASK_INTERRUPTIBLE) or freeze it (TASK_FROZEN) if +freezer_active is set. The main loop of a freezable kernel thread may look like the following one:: set_freezable(); - do { - hub_events(); - wait_event_freezable(khubd_wait, - !list_empty(&hub_event_list) || - kthread_should_stop()); - } while (!kthread_should_stop() || !list_empty(&hub_event_list)); -(from drivers/usb/core/hub.c::hub_thread()). + while (true) { + struct task_struct *tsk = NULL; -If a freezable kernel thread fails to call try_to_freeze() after the freezer has -initiated a freezing operation, the freezing of tasks will fail and the entire -hibernation operation will be cancelled. For this reason, freezable kernel -threads must call try_to_freeze() somewhere or use one of the + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + spin_lock_irq(&oom_reaper_lock); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; + } + spin_unlock_irq(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); + } + +(from mm/oom_kill.c::oom_reaper()). + +If a freezable kernel thread is not put to the frozen state after the freezer +has initiated a freezing operation, the freezing of tasks will fail and the +entire system-wide transition will be cancelled. For this reason, freezable +kernel threads must call try_to_freeze() somewhere or use one of the wait_event_freezable() and wait_event_freezable_timeout() macros. After the system memory state has been restored from a hibernation image and devices have been reinitialized, the function thaw_processes() is called in -order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that -have been frozen leave __refrigerator() and continue running. +order to wake up each frozen task. Then, the tasks that have been frozen leave +__refrigerator() and continue running. Rationale behind the functions dealing with freezing and thawing of tasks @@ -96,7 +106,8 @@ III. Which kernel threads are freezable? Kernel threads are not freezable by default. However, a kernel thread may clear PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE directly is not allowed). From this point it is regarded as freezable -and must call try_to_freeze() in a suitable place. +and must call try_to_freeze() or variants of wait_event_freezable() in a +suitable place. IV. Why do we do that? ====================== From e0f4bd26e29bf6162cdc9dc6fb7522bde7b74d07 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Wed, 20 Dec 2023 08:35:35 +0800 Subject: [PATCH 08/12] PM: sleep: Remove obsolete comment from unlock_system_sleep() With the freezer changes introduced by commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"), the comment in unlock_system_sleep() has become obsolete, there is no need to retain it. Signed-off-by: Kevin Hao Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/kernel/power/main.c b/kernel/power/main.c index f6425ae3e8b0..b1ae9b677d03 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -60,22 +60,6 @@ EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { - /* - * Don't use freezer_count() because we don't want the call to - * try_to_freeze() here. - * - * Reason: - * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the - * system_transition_mutex lock, since the freezer always works with - * system_transition_mutex held. - * - * More importantly, in the case of hibernation, - * unlock_system_sleep() gets called in snapshot_read() and - * snapshot_write() when the freezing condition is still in effect. - * Which means, if we use try_to_freeze() here, it would make them - * enter the refrigerator, thus causing hibernation to lockup. - */ if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); From dadce3fbaf10250b35d540caff475ff93b259de0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Dec 2023 22:02:46 -0800 Subject: [PATCH 09/12] PM: hibernate: Repair excess function parameter description warning Function swsusp_close() does not have any parameters, so remove the description of parameter @exclusive to prevent this warning. swap.c:1573: warning: Excess function parameter 'exclusive' description in 'swsusp_close' Signed-off-by: Randy Dunlap [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 975e7195573b..6053ddddaf65 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1566,7 +1566,6 @@ put: /** * swsusp_close - close resume device. - * @exclusive: Close the resume device which is exclusively opened. */ void swsusp_close(void) From 6aa09a5bccd8e224d917afdb4c278fc66aacde4d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 27 Dec 2023 21:37:02 +0100 Subject: [PATCH 10/12] async: Split async_schedule_node_domain() In preparation for subsequent changes, split async_schedule_node_domain() in two pieces so as to allow the bottom part of it to be called from a somewhat different code path. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Tested-by: Youngmin Nam Reviewed-by: Ulf Hansson --- kernel/async.c | 56 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/kernel/async.c b/kernel/async.c index b2c4ba5686ee..cffe6b4cff9f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -145,6 +145,39 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } +static async_cookie_t __async_schedule_node_domain(async_func_t func, + void *data, int node, + struct async_domain *domain, + struct async_entry *entry) +{ + async_cookie_t newcookie; + unsigned long flags; + + INIT_LIST_HEAD(&entry->domain_list); + INIT_LIST_HEAD(&entry->global_list); + INIT_WORK(&entry->work, async_run_entry_fn); + entry->func = func; + entry->data = data; + entry->domain = domain; + + spin_lock_irqsave(&async_lock, flags); + + /* allocate cookie and queue */ + newcookie = entry->cookie = next_cookie++; + + list_add_tail(&entry->domain_list, &domain->pending); + if (domain->registered) + list_add_tail(&entry->global_list, &async_global_pending); + + atomic_inc(&entry_count); + spin_unlock_irqrestore(&async_lock, flags); + + /* schedule for execution */ + queue_work_node(node, system_unbound_wq, &entry->work); + + return newcookie; +} + /** * async_schedule_node_domain - NUMA specific version of async_schedule_domain * @func: function to execute asynchronously @@ -186,29 +219,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, func(data, newcookie); return newcookie; } - INIT_LIST_HEAD(&entry->domain_list); - INIT_LIST_HEAD(&entry->global_list); - INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = func; - entry->data = data; - entry->domain = domain; - spin_lock_irqsave(&async_lock, flags); - - /* allocate cookie and queue */ - newcookie = entry->cookie = next_cookie++; - - list_add_tail(&entry->domain_list, &domain->pending); - if (domain->registered) - list_add_tail(&entry->global_list, &async_global_pending); - - atomic_inc(&entry_count); - spin_unlock_irqrestore(&async_lock, flags); - - /* schedule for execution */ - queue_work_node(node, system_unbound_wq, &entry->work); - - return newcookie; + return __async_schedule_node_domain(func, data, node, domain, entry); } EXPORT_SYMBOL_GPL(async_schedule_node_domain); From 7d4b5d7a37bdd63a5a3371b988744b060d5bb86f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 27 Dec 2023 21:38:23 +0100 Subject: [PATCH 11/12] async: Introduce async_schedule_dev_nocall() In preparation for subsequent changes, introduce a specialized variant of async_schedule_dev() that will not invoke the argument function synchronously when it cannot be scheduled for asynchronous execution. The new function, async_schedule_dev_nocall(), will be used for fixing possible deadlocks in the system-wide power management core code. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka for the series. Tested-by: Youngmin Nam Reviewed-by: Ulf Hansson --- include/linux/async.h | 2 ++ kernel/async.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/async.h b/include/linux/async.h index cce4ad31e8fc..33c9ff4afb49 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -90,6 +90,8 @@ async_schedule_dev(async_func_t func, struct device *dev) return async_schedule_node(func, dev, dev_to_node(dev)); } +bool async_schedule_dev_nocall(async_func_t func, struct device *dev); + /** * async_schedule_dev_domain - A device specific version of async_schedule_domain * @func: function to execute asynchronously diff --git a/kernel/async.c b/kernel/async.c index cffe6b4cff9f..673bba6bdf3a 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -243,6 +243,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node) } EXPORT_SYMBOL_GPL(async_schedule_node); +/** + * async_schedule_dev_nocall - A simplified variant of async_schedule_dev() + * @func: function to execute asynchronously + * @dev: device argument to be passed to function + * + * @dev is used as both the argument for the function and to provide NUMA + * context for where to run the function. + * + * If the asynchronous execution of @func is scheduled successfully, return + * true. Otherwise, do nothing and return false, unlike async_schedule_dev() + * that will run the function synchronously then. + */ +bool async_schedule_dev_nocall(async_func_t func, struct device *dev) +{ + struct async_entry *entry; + + entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL); + + /* Give up if there is no memory or too much work. */ + if (!entry || atomic_read(&entry_count) > MAX_WORK) { + kfree(entry); + return false; + } + + __async_schedule_node_domain(func, dev, dev_to_node(dev), + &async_dfl_domain, entry); + return true; +} + /** * async_synchronize_full - synchronize all asynchronous function calls * From 7839d0078e0d5e6cc2fa0b0dfbee71de74f1e557 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 27 Dec 2023 21:41:06 +0100 Subject: [PATCH 12/12] PM: sleep: Fix possible deadlocks in core system-wide PM code It is reported that in low-memory situations the system-wide resume core code deadlocks, because async_schedule_dev() executes its argument function synchronously if it cannot allocate memory (and not only in that case) and that function attempts to acquire a mutex that is already held. Executing the argument function synchronously from within dpm_async_fn() may also be problematic for ordering reasons (it may cause a consumer device's resume callback to be invoked before a requisite supplier device's one, for example). Address this by changing the code in question to use async_schedule_dev_nocall() for scheduling the asynchronous execution of device suspend and resume functions and to directly run them synchronously if async_schedule_dev_nocall() returns false. Link: https://lore.kernel.org/linux-pm/ZYvjiqX6EsL15moe@perf/ Reported-by: Youngmin Nam Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Tested-by: Youngmin Nam Reviewed-by: Ulf Hansson Cc: 5.7+ # 5.7+: 6aa09a5bccd8 async: Split async_schedule_node_domain() Cc: 5.7+ # 5.7+: 7d4b5d7a37bd async: Introduce async_schedule_dev_nocall() Cc: 5.7+ # 5.7+ --- drivers/base/power/main.c | 148 ++++++++++++++++++-------------------- 1 file changed, 68 insertions(+), 80 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index f85f3515c258..9c5a5f4dba5a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -579,7 +579,7 @@ bool dev_pm_skip_resume(struct device *dev) } /** - * device_resume_noirq - Execute a "noirq resume" callback for given device. + * __device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. @@ -587,7 +587,7 @@ bool dev_pm_skip_resume(struct device *dev) * The driver of @dev will not receive interrupts while this function is being * executed. */ -static int device_resume_noirq(struct device *dev, pm_message_t state, bool async) +static void __device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -655,7 +655,13 @@ Skip: Out: complete_all(&dev->power.completion); TRACE_RESUME(error); - return error; + + if (error) { + suspend_stats.failed_resume_noirq++; + dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); + dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); + } } static bool is_async(struct device *dev) @@ -668,11 +674,15 @@ static bool dpm_async_fn(struct device *dev, async_func_t func) { reinit_completion(&dev->power.completion); - if (is_async(dev)) { - get_device(dev); - async_schedule_dev(func, dev); + if (!is_async(dev)) + return false; + + get_device(dev); + + if (async_schedule_dev_nocall(func, dev)) return true; - } + + put_device(dev); return false; } @@ -680,15 +690,19 @@ static bool dpm_async_fn(struct device *dev, async_func_t func) static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; - int error; - - error = device_resume_noirq(dev, pm_transition, true); - if (error) - pm_dev_err(dev, pm_transition, " async", error); + __device_resume_noirq(dev, pm_transition, true); put_device(dev); } +static void device_resume_noirq(struct device *dev) +{ + if (dpm_async_fn(dev, async_resume_noirq)) + return; + + __device_resume_noirq(dev, pm_transition, false); +} + static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; @@ -698,14 +712,6 @@ static void dpm_noirq_resume_devices(pm_message_t state) mutex_lock(&dpm_list_mtx); pm_transition = state; - /* - * Advanced the async threads upfront, - * in case the starting of async threads is - * delayed by non-async resuming devices. - */ - list_for_each_entry(dev, &dpm_noirq_list, power.entry) - dpm_async_fn(dev, async_resume_noirq); - while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); get_device(dev); @@ -713,17 +719,7 @@ static void dpm_noirq_resume_devices(pm_message_t state) mutex_unlock(&dpm_list_mtx); - if (!is_async(dev)) { - int error; - - error = device_resume_noirq(dev, state, false); - if (error) { - suspend_stats.failed_resume_noirq++; - dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); - dpm_save_failed_dev(dev_name(dev)); - pm_dev_err(dev, state, " noirq", error); - } - } + device_resume_noirq(dev); put_device(dev); @@ -751,14 +747,14 @@ void dpm_resume_noirq(pm_message_t state) } /** - * device_resume_early - Execute an "early resume" callback for given device. + * __device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ -static int device_resume_early(struct device *dev, pm_message_t state, bool async) +static void __device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -811,21 +807,31 @@ Out: pm_runtime_enable(dev); complete_all(&dev->power.completion); - return error; + + if (error) { + suspend_stats.failed_resume_early++; + dpm_save_failed_step(SUSPEND_RESUME_EARLY); + dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, async ? " async early" : " early", error); + } } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; - int error; - - error = device_resume_early(dev, pm_transition, true); - if (error) - pm_dev_err(dev, pm_transition, " async", error); + __device_resume_early(dev, pm_transition, true); put_device(dev); } +static void device_resume_early(struct device *dev) +{ + if (dpm_async_fn(dev, async_resume_early)) + return; + + __device_resume_early(dev, pm_transition, false); +} + /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. @@ -839,14 +845,6 @@ void dpm_resume_early(pm_message_t state) mutex_lock(&dpm_list_mtx); pm_transition = state; - /* - * Advanced the async threads upfront, - * in case the starting of async threads is - * delayed by non-async resuming devices. - */ - list_for_each_entry(dev, &dpm_late_early_list, power.entry) - dpm_async_fn(dev, async_resume_early); - while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); get_device(dev); @@ -854,17 +852,7 @@ void dpm_resume_early(pm_message_t state) mutex_unlock(&dpm_list_mtx); - if (!is_async(dev)) { - int error; - - error = device_resume_early(dev, state, false); - if (error) { - suspend_stats.failed_resume_early++; - dpm_save_failed_step(SUSPEND_RESUME_EARLY); - dpm_save_failed_dev(dev_name(dev)); - pm_dev_err(dev, state, " early", error); - } - } + device_resume_early(dev); put_device(dev); @@ -888,12 +876,12 @@ void dpm_resume_start(pm_message_t state) EXPORT_SYMBOL_GPL(dpm_resume_start); /** - * device_resume - Execute "resume" callbacks for given device. + * __device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. */ -static int device_resume(struct device *dev, pm_message_t state, bool async) +static void __device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -975,20 +963,30 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) TRACE_RESUME(error); - return error; + if (error) { + suspend_stats.failed_resume++; + dpm_save_failed_step(SUSPEND_RESUME); + dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, async ? " async" : "", error); + } } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; - int error; - error = device_resume(dev, pm_transition, true); - if (error) - pm_dev_err(dev, pm_transition, " async", error); + __device_resume(dev, pm_transition, true); put_device(dev); } +static void device_resume(struct device *dev) +{ + if (dpm_async_fn(dev, async_resume)) + return; + + __device_resume(dev, pm_transition, false); +} + /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. @@ -1008,27 +1006,17 @@ void dpm_resume(pm_message_t state) pm_transition = state; async_error = 0; - list_for_each_entry(dev, &dpm_suspended_list, power.entry) - dpm_async_fn(dev, async_resume); - while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); + get_device(dev); - if (!is_async(dev)) { - int error; - mutex_unlock(&dpm_list_mtx); + mutex_unlock(&dpm_list_mtx); - error = device_resume(dev, state, false); - if (error) { - suspend_stats.failed_resume++; - dpm_save_failed_step(SUSPEND_RESUME); - dpm_save_failed_dev(dev_name(dev)); - pm_dev_err(dev, state, "", error); - } + device_resume(dev); + + mutex_lock(&dpm_list_mtx); - mutex_lock(&dpm_list_mtx); - } if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list);