mm/memory-failure: send SIGBUS in the event of thp split fail
While handling hwpoison in a THP page, it is possible that try_to_split_thp_page() fails, for example when the THP page has been RDMA pinned. At this point the kernel cannot isolate the poisoned THP page; all it can do is send a SIGBUS to the user process with a meaningful payload to give user-level recovery a chance. Link: https://lkml.kernel.org/r/20240524215306.2705454-6-jane.chu@oracle.com Signed-off-by: Jane Chu <jane.chu@oracle.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Acked-by: Miaohe Lin <linmiaohe@huawei.com> Cc: Naoya Horiguchi <nao.horiguchi@gmail.com> Cc: Oscar Salvador <osalvador@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
9b0ab153d7
commit
1a3798dece
@ -1706,7 +1706,12 @@ static int identify_page_state(unsigned long pfn, struct page *p,
|
||||
return page_action(ps, p, pfn);
|
||||
}
|
||||
|
||||
static int try_to_split_thp_page(struct page *page)
|
||||
/*
|
||||
* When 'release' is 'false', it means that if thp split has failed,
|
||||
* there is still more to do, hence the page refcount we took earlier
|
||||
* is still needed.
|
||||
*/
|
||||
static int try_to_split_thp_page(struct page *page, bool release)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@ -1714,7 +1719,7 @@ static int try_to_split_thp_page(struct page *page)
|
||||
ret = split_huge_page(page);
|
||||
unlock_page(page);
|
||||
|
||||
if (unlikely(ret))
|
||||
if (ret && release)
|
||||
put_page(page);
|
||||
|
||||
return ret;
|
||||
@ -2186,6 +2191,22 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling condition is as such: thp split failed, page might have
|
||||
* been RDMA pinned, not much can be done for recovery.
|
||||
* But a SIGBUS should be delivered with vaddr provided so that the user
|
||||
* application has a chance to recover. Also, application processes'
|
||||
* election for MCE early kill will be honored.
|
||||
*/
|
||||
static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
|
||||
struct folio *folio)
|
||||
{
|
||||
/* Local list that is handed first to collect_procs() and then to kill_procs(). */
LIST_HEAD(tokill);
|
||||
|
||||
/* Only the MF_ACTION_REQUIRED bit of 'flags' is forwarded as the last argument. */
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
|
||||
/* NOTE(review): the literal 'true' presumably requests a forced kill -- confirm against kill_procs(). */
kill_procs(&tokill, true, pfn, flags);
|
||||
}
|
||||
|
||||
/**
|
||||
* memory_failure - Handle memory failure of a page.
|
||||
* @pfn: Page Number of the corrupted page
|
||||
@ -2327,8 +2348,11 @@ try_again:
|
||||
* page is a valid handlable page.
|
||||
*/
|
||||
folio_set_has_hwpoisoned(folio);
|
||||
if (try_to_split_thp_page(p) < 0) {
|
||||
res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
|
||||
if (try_to_split_thp_page(p, false) < 0) {
|
||||
res = -EHWPOISON;
|
||||
kill_procs_now(p, pfn, flags, folio);
|
||||
put_page(p);
|
||||
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_FAILED);
|
||||
goto unlock_mutex;
|
||||
}
|
||||
VM_BUG_ON_PAGE(!page_count(p), p);
|
||||
@ -2709,7 +2733,7 @@ static int soft_offline_in_use_page(struct page *page)
|
||||
};
|
||||
|
||||
if (!huge && folio_test_large(folio)) {
|
||||
if (try_to_split_thp_page(page)) {
|
||||
if (try_to_split_thp_page(page, true)) {
|
||||
pr_info("soft offline: %#lx: thp split failed\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user