Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6
* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (22 commits) Add _addr_lsb field to ia64 siginfo Fix migration.c compilation on s390 HWPOISON: Remove retry loop for try_to_unmap HWPOISON: Turn addr_valid from bitfield into char HWPOISON: Disable DEBUG by default HWPOISON: Convert pr_debugs to pr_info HWPOISON: Improve comments in memory-failure.c x86: HWPOISON: Report correct address granuality for huge hwpoison faults Encode huge page size for VM_FAULT_HWPOISON errors Fix build error with !CONFIG_MIGRATION hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning Clean up __page_set_anon_rmap HWPOISON, hugetlb: fix unpoison for hugepage HWPOISON, hugetlb: soft offlining for hugepage HWPOSION, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED hugetlb: move refcounting in hugepage allocation inside hugetlb_lock HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page() hugetlb: hugepage migration core hugetlb: redefine hugepage copy functions hugetlb: add allocate function for hugepage migration ...
This commit is contained in:
commit
f1ebdd60cc
@ -62,6 +62,7 @@ typedef struct siginfo {
|
||||
int _imm; /* immediate value for "break" */
|
||||
unsigned int _flags; /* see below */
|
||||
unsigned long _isr; /* isr */
|
||||
short _addr_lsb; /* lsb of faulting address */
|
||||
} _sigfault;
|
||||
|
||||
/* SIGPOLL */
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <linux/kprobes.h> /* __kprobes, ... */
|
||||
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
|
||||
#include <linux/perf_event.h> /* perf_sw_event */
|
||||
#include <linux/hugetlb.h> /* hstate_index_to_shift */
|
||||
|
||||
#include <asm/traps.h> /* dotraplinkage, ... */
|
||||
#include <asm/pgalloc.h> /* pgd_*(), ... */
|
||||
@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
|
||||
|
||||
static void
|
||||
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
|
||||
struct task_struct *tsk)
|
||||
struct task_struct *tsk, int fault)
|
||||
{
|
||||
unsigned lsb = 0;
|
||||
siginfo_t info;
|
||||
|
||||
info.si_signo = si_signo;
|
||||
info.si_errno = 0;
|
||||
info.si_code = si_code;
|
||||
info.si_addr = (void __user *)address;
|
||||
info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
|
||||
if (fault & VM_FAULT_HWPOISON_LARGE)
|
||||
lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
|
||||
if (fault & VM_FAULT_HWPOISON)
|
||||
lsb = PAGE_SHIFT;
|
||||
info.si_addr_lsb = lsb;
|
||||
|
||||
force_sig_info(si_signo, &info, tsk);
|
||||
}
|
||||
@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
|
||||
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
|
||||
tsk->thread.trap_no = 14;
|
||||
|
||||
force_sig_info_fault(SIGSEGV, si_code, address, tsk);
|
||||
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
|
||||
|
||||
return;
|
||||
}
|
||||
@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
|
||||
tsk->thread.trap_no = 14;
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
if (fault & VM_FAULT_HWPOISON) {
|
||||
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
|
||||
printk(KERN_ERR
|
||||
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
|
||||
tsk->comm, tsk->pid, address);
|
||||
code = BUS_MCEERR_AR;
|
||||
}
|
||||
#endif
|
||||
force_sig_info_fault(SIGBUS, code, address, tsk);
|
||||
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
|
||||
}
|
||||
|
||||
static noinline void
|
||||
@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
|
||||
if (fault & VM_FAULT_OOM) {
|
||||
out_of_memory(regs, error_code, address);
|
||||
} else {
|
||||
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
|
||||
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
|
||||
VM_FAULT_HWPOISON_LARGE))
|
||||
do_sigbus(regs, error_code, address, fault);
|
||||
else
|
||||
BUG();
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include <linux/statfs.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/magic.h>
|
||||
#include <linux/migrate.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hugetlbfs_migrate_page(struct address_space *mapping,
|
||||
struct page *newpage, struct page *page)
|
||||
{
|
||||
int rc;
|
||||
|
||||
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
|
||||
if (rc)
|
||||
return rc;
|
||||
migrate_page_copy(newpage, page);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
|
||||
{
|
||||
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
|
||||
@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
|
||||
.write_begin = hugetlbfs_write_begin,
|
||||
.write_end = hugetlbfs_write_end,
|
||||
.set_page_dirty = hugetlbfs_set_page_dirty,
|
||||
.migratepage = hugetlbfs_migrate_page,
|
||||
};
|
||||
|
||||
|
||||
|
@ -98,6 +98,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
|
||||
err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
|
||||
#ifdef __ARCH_SI_TRAPNO
|
||||
err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
|
||||
#endif
|
||||
#ifdef BUS_MCEERR_AO
|
||||
/*
|
||||
* Other callers might not initialize the si_addr_lsb field,
|
||||
* so check explicitly for the right codes here.
|
||||
*/
|
||||
if (kinfo->si_code == BUS_MCEERR_AR ||
|
||||
kinfo->si_code == BUS_MCEERR_AO)
|
||||
err |= __put_user((short) kinfo->si_addr_lsb,
|
||||
&uinfo->ssi_addr_lsb);
|
||||
#endif
|
||||
break;
|
||||
case __SI_CHLD:
|
||||
|
@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
|
||||
struct vm_area_struct *vma,
|
||||
int acctflags);
|
||||
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
|
||||
void __isolate_hwpoisoned_huge_page(struct page *page);
|
||||
int dequeue_hwpoisoned_huge_page(struct page *page);
|
||||
void copy_huge_page(struct page *dst, struct page *src);
|
||||
|
||||
extern unsigned long hugepages_treat_as_movable;
|
||||
extern const unsigned long hugetlb_zero, hugetlb_infinity;
|
||||
@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
|
||||
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
|
||||
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
|
||||
#define huge_pte_offset(mm, address) 0
|
||||
#define __isolate_hwpoisoned_huge_page(page) 0
|
||||
#define dequeue_hwpoisoned_huge_page(page) 0
|
||||
static inline void copy_huge_page(struct page *dst, struct page *src)
|
||||
{
|
||||
}
|
||||
|
||||
#define hugetlb_change_protection(vma, address, end, newprot)
|
||||
|
||||
@ -228,6 +232,8 @@ struct huge_bootmem_page {
|
||||
struct hstate *hstate;
|
||||
};
|
||||
|
||||
struct page *alloc_huge_page_node(struct hstate *h, int nid);
|
||||
|
||||
/* arch callback */
|
||||
int __init alloc_bootmem_huge_page(struct hstate *h);
|
||||
|
||||
@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page)
|
||||
return size_to_hstate(PAGE_SIZE << compound_order(page));
|
||||
}
|
||||
|
||||
static inline unsigned hstate_index_to_shift(unsigned index)
|
||||
{
|
||||
return hstates[index].order + PAGE_SHIFT;
|
||||
}
|
||||
|
||||
#else
|
||||
struct hstate {};
|
||||
#define alloc_huge_page_node(h, nid) NULL
|
||||
#define alloc_bootmem_huge_page(h) NULL
|
||||
#define hstate_file(f) NULL
|
||||
#define hstate_vma(v) NULL
|
||||
@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
#define hstate_index_to_shift(index) 0
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_HUGETLB_H */
|
||||
|
@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
|
||||
struct page *, struct page *);
|
||||
extern int migrate_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, int offlining);
|
||||
extern int migrate_huge_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, int offlining);
|
||||
|
||||
extern int fail_migrate_page(struct address_space *,
|
||||
struct page *, struct page *);
|
||||
@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
|
||||
extern int migrate_vmas(struct mm_struct *mm,
|
||||
const nodemask_t *from, const nodemask_t *to,
|
||||
unsigned long flags);
|
||||
extern void migrate_page_copy(struct page *newpage, struct page *page);
|
||||
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
struct page *newpage, struct page *page);
|
||||
#else
|
||||
#define PAGE_MIGRATION 0
|
||||
|
||||
static inline void putback_lru_pages(struct list_head *l) {}
|
||||
static inline int migrate_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, int offlining) { return -ENOSYS; }
|
||||
static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, int offlining) { return -ENOSYS; }
|
||||
|
||||
static inline int migrate_prep(void) { return -ENOSYS; }
|
||||
static inline int migrate_prep_local(void) { return -ENOSYS; }
|
||||
@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
static inline void migrate_page_copy(struct page *newpage,
|
||||
struct page *page) {}
|
||||
|
||||
static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
struct page *newpage, struct page *page)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
/* Possible settings for the migrate_page() method in address_operations */
|
||||
#define migrate_page NULL
|
||||
#define fail_migrate_page NULL
|
||||
|
@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
|
||||
#define VM_FAULT_SIGBUS 0x0002
|
||||
#define VM_FAULT_MAJOR 0x0004
|
||||
#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
|
||||
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */
|
||||
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
|
||||
#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
|
||||
|
||||
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
|
||||
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
|
||||
|
||||
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
|
||||
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
|
||||
|
||||
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
|
||||
VM_FAULT_HWPOISON_LARGE)
|
||||
|
||||
/* Encode hstate index for a hwpoisoned large page */
|
||||
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
|
||||
#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
|
||||
|
||||
/*
|
||||
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
|
||||
|
@ -33,6 +33,7 @@ struct signalfd_siginfo {
|
||||
__u64 ssi_utime;
|
||||
__u64 ssi_stime;
|
||||
__u64 ssi_addr;
|
||||
__u16 ssi_addr_lsb;
|
||||
|
||||
/*
|
||||
* Pad structure to 128 bytes. Remember to update the
|
||||
@ -43,7 +44,7 @@ struct signalfd_siginfo {
|
||||
* comes out of a read(2) and we really don't want to have
|
||||
* a compat on read(2).
|
||||
*/
|
||||
__u8 __pad[48];
|
||||
__u8 __pad[46];
|
||||
};
|
||||
|
||||
|
||||
|
233
mm/hugetlb.c
233
mm/hugetlb.c
@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
|
||||
}
|
||||
}
|
||||
|
||||
static void copy_gigantic_page(struct page *dst, struct page *src,
|
||||
static void copy_user_gigantic_page(struct page *dst, struct page *src,
|
||||
unsigned long addr, struct vm_area_struct *vma)
|
||||
{
|
||||
int i;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
struct page *dst_base = dst;
|
||||
struct page *src_base = src;
|
||||
might_sleep();
|
||||
|
||||
for (i = 0; i < pages_per_huge_page(h); ) {
|
||||
cond_resched();
|
||||
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
|
||||
@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
|
||||
src = mem_map_next(src, src_base, i);
|
||||
}
|
||||
}
|
||||
static void copy_huge_page(struct page *dst, struct page *src,
|
||||
|
||||
static void copy_user_huge_page(struct page *dst, struct page *src,
|
||||
unsigned long addr, struct vm_area_struct *vma)
|
||||
{
|
||||
int i;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
|
||||
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
|
||||
copy_gigantic_page(dst, src, addr, vma);
|
||||
copy_user_gigantic_page(dst, src, addr, vma);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
|
||||
}
|
||||
}
|
||||
|
||||
static void copy_gigantic_page(struct page *dst, struct page *src)
|
||||
{
|
||||
int i;
|
||||
struct hstate *h = page_hstate(src);
|
||||
struct page *dst_base = dst;
|
||||
struct page *src_base = src;
|
||||
|
||||
for (i = 0; i < pages_per_huge_page(h); ) {
|
||||
cond_resched();
|
||||
copy_highpage(dst, src);
|
||||
|
||||
i++;
|
||||
dst = mem_map_next(dst, dst_base, i);
|
||||
src = mem_map_next(src, src_base, i);
|
||||
}
|
||||
}
|
||||
|
||||
void copy_huge_page(struct page *dst, struct page *src)
|
||||
{
|
||||
int i;
|
||||
struct hstate *h = page_hstate(src);
|
||||
|
||||
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
|
||||
copy_gigantic_page(dst, src);
|
||||
return;
|
||||
}
|
||||
|
||||
might_sleep();
|
||||
for (i = 0; i < pages_per_huge_page(h); i++) {
|
||||
cond_resched();
|
||||
copy_highpage(dst + i, src + i);
|
||||
}
|
||||
}
|
||||
|
||||
static void enqueue_huge_page(struct hstate *h, struct page *page)
|
||||
{
|
||||
int nid = page_to_nid(page);
|
||||
@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
|
||||
h->free_huge_pages_node[nid]++;
|
||||
}
|
||||
|
||||
static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
if (list_empty(&h->hugepage_freelists[nid]))
|
||||
return NULL;
|
||||
page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
|
||||
list_del(&page->lru);
|
||||
set_page_refcounted(page);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
return page;
|
||||
}
|
||||
|
||||
static struct page *dequeue_huge_page_vma(struct hstate *h,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long address, int avoid_reserve)
|
||||
{
|
||||
int nid;
|
||||
struct page *page = NULL;
|
||||
struct mempolicy *mpol;
|
||||
nodemask_t *nodemask;
|
||||
@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
|
||||
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist,
|
||||
MAX_NR_ZONES - 1, nodemask) {
|
||||
nid = zone_to_nid(zone);
|
||||
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
|
||||
!list_empty(&h->hugepage_freelists[nid])) {
|
||||
page = list_entry(h->hugepage_freelists[nid].next,
|
||||
struct page, lru);
|
||||
list_del(&page->lru);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
|
||||
if (!avoid_reserve)
|
||||
decrement_hugepage_resv_vma(h, vma);
|
||||
|
||||
break;
|
||||
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
|
||||
page = dequeue_huge_page_node(h, zone_to_nid(zone));
|
||||
if (page) {
|
||||
if (!avoid_reserve)
|
||||
decrement_hugepage_resv_vma(h, vma);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
err:
|
||||
@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct page *alloc_buddy_huge_page(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned int nid;
|
||||
unsigned int r_nid;
|
||||
|
||||
if (h->order >= MAX_ORDER)
|
||||
return NULL;
|
||||
@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
|
||||
__GFP_REPEAT|__GFP_NOWARN,
|
||||
huge_page_order(h));
|
||||
if (nid == NUMA_NO_NODE)
|
||||
page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
|
||||
__GFP_REPEAT|__GFP_NOWARN,
|
||||
huge_page_order(h));
|
||||
else
|
||||
page = alloc_pages_exact_node(nid,
|
||||
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
|
||||
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
|
||||
|
||||
if (page && arch_prepare_hugepage(page)) {
|
||||
__free_pages(page, huge_page_order(h));
|
||||
@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (page) {
|
||||
/*
|
||||
* This page is now managed by the hugetlb allocator and has
|
||||
* no users -- drop the buddy allocator's reference.
|
||||
*/
|
||||
put_page_testzero(page);
|
||||
VM_BUG_ON(page_count(page));
|
||||
nid = page_to_nid(page);
|
||||
r_nid = page_to_nid(page);
|
||||
set_compound_page_dtor(page, free_huge_page);
|
||||
/*
|
||||
* We incremented the global counters already
|
||||
*/
|
||||
h->nr_huge_pages_node[nid]++;
|
||||
h->surplus_huge_pages_node[nid]++;
|
||||
h->nr_huge_pages_node[r_nid]++;
|
||||
h->surplus_huge_pages_node[r_nid]++;
|
||||
__count_vm_event(HTLB_BUDDY_PGALLOC);
|
||||
} else {
|
||||
h->nr_huge_pages--;
|
||||
@ -847,6 +887,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
* This allocation function is useful in the context where vma is irrelevant.
|
||||
* E.g. soft-offlining uses this function because it only cares about the
|
||||
* physical address of the error page.
|
||||
*/
|
||||
struct page *alloc_huge_page_node(struct hstate *h, int nid)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
page = dequeue_huge_page_node(h, nid);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
if (!page)
|
||||
page = alloc_buddy_huge_page(h, nid);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase the hugetlb pool such that it can accommodate a reservation
|
||||
* of size 'delta'.
|
||||
@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
|
||||
retry:
|
||||
spin_unlock(&hugetlb_lock);
|
||||
for (i = 0; i < needed; i++) {
|
||||
page = alloc_buddy_huge_page(h, NULL, 0);
|
||||
if (!page) {
|
||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||
if (!page)
|
||||
/*
|
||||
* We were not able to allocate enough pages to
|
||||
* satisfy the entire reservation so we free what
|
||||
* we've allocated so far.
|
||||
*/
|
||||
spin_lock(&hugetlb_lock);
|
||||
needed = 0;
|
||||
goto free;
|
||||
}
|
||||
|
||||
list_add(&page->lru, &surplus_list);
|
||||
}
|
||||
@ -908,31 +964,31 @@ retry:
|
||||
needed += allocated;
|
||||
h->resv_huge_pages += delta;
|
||||
ret = 0;
|
||||
free:
|
||||
|
||||
spin_unlock(&hugetlb_lock);
|
||||
/* Free the needed pages to the hugetlb pool */
|
||||
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
|
||||
if ((--needed) < 0)
|
||||
break;
|
||||
list_del(&page->lru);
|
||||
/*
|
||||
* This page is now managed by the hugetlb allocator and has
|
||||
* no users -- drop the buddy allocator's reference.
|
||||
*/
|
||||
put_page_testzero(page);
|
||||
VM_BUG_ON(page_count(page));
|
||||
enqueue_huge_page(h, page);
|
||||
}
|
||||
|
||||
/* Free unnecessary surplus pages to the buddy allocator */
|
||||
free:
|
||||
if (!list_empty(&surplus_list)) {
|
||||
spin_unlock(&hugetlb_lock);
|
||||
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
|
||||
list_del(&page->lru);
|
||||
/*
|
||||
* The page has a reference count of zero already, so
|
||||
* call free_huge_page directly instead of using
|
||||
* put_page. This must be done with hugetlb_lock
|
||||
* unlocked which is safe because free_huge_page takes
|
||||
* hugetlb_lock before deciding how to free the page.
|
||||
*/
|
||||
free_huge_page(page);
|
||||
put_page(page);
|
||||
}
|
||||
spin_lock(&hugetlb_lock);
|
||||
}
|
||||
spin_lock(&hugetlb_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
if (!page) {
|
||||
page = alloc_buddy_huge_page(h, vma, addr);
|
||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||
if (!page) {
|
||||
hugetlb_put_quota(inode->i_mapping, chg);
|
||||
return ERR_PTR(-VM_FAULT_SIGBUS);
|
||||
}
|
||||
}
|
||||
|
||||
set_page_refcounted(page);
|
||||
set_page_private(page, (unsigned long) mapping);
|
||||
|
||||
vma_commit_reservation(h, vma, addr);
|
||||
@ -2153,6 +2208,19 @@ nomem:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int is_hugetlb_entry_migration(pte_t pte)
|
||||
{
|
||||
swp_entry_t swp;
|
||||
|
||||
if (huge_pte_none(pte) || pte_present(pte))
|
||||
return 0;
|
||||
swp = pte_to_swp_entry(pte);
|
||||
if (non_swap_entry(swp) && is_migration_entry(swp)) {
|
||||
return 1;
|
||||
} else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int is_hugetlb_entry_hwpoisoned(pte_t pte)
|
||||
{
|
||||
swp_entry_t swp;
|
||||
@ -2383,7 +2451,7 @@ retry_avoidcopy:
|
||||
if (unlikely(anon_vma_prepare(vma)))
|
||||
return VM_FAULT_OOM;
|
||||
|
||||
copy_huge_page(new_page, old_page, address, vma);
|
||||
copy_user_huge_page(new_page, old_page, address, vma);
|
||||
__SetPageUptodate(new_page);
|
||||
|
||||
/*
|
||||
@ -2515,21 +2583,19 @@ retry:
|
||||
hugepage_add_new_anon_rmap(page, vma, address);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If memory error occurs between mmap() and fault, some process
|
||||
* don't have hwpoisoned swap entry for errored virtual address.
|
||||
* So we need to block hugepage fault by PG_hwpoison bit check.
|
||||
*/
|
||||
if (unlikely(PageHWPoison(page))) {
|
||||
ret = VM_FAULT_HWPOISON |
|
||||
VM_FAULT_SET_HINDEX(h - hstates);
|
||||
goto backout_unlocked;
|
||||
}
|
||||
page_dup_rmap(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Since memory error handler replaces pte into hwpoison swap entry
|
||||
* at the time of error handling, a process which reserved but not have
|
||||
* the mapping to the error hugepage does not have hwpoison swap entry.
|
||||
* So we need to block accesses from such a process by checking
|
||||
* PG_hwpoison bit here.
|
||||
*/
|
||||
if (unlikely(PageHWPoison(page))) {
|
||||
ret = VM_FAULT_HWPOISON;
|
||||
goto backout_unlocked;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are going to COW a private mapping later, we examine the
|
||||
* pending reservations for this page now. This will ensure that
|
||||
@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
ptep = huge_pte_offset(mm, address);
|
||||
if (ptep) {
|
||||
entry = huge_ptep_get(ptep);
|
||||
if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
|
||||
return VM_FAULT_HWPOISON;
|
||||
if (unlikely(is_hugetlb_entry_migration(entry))) {
|
||||
migration_entry_wait(mm, (pmd_t *)ptep, address);
|
||||
return 0;
|
||||
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
|
||||
return VM_FAULT_HWPOISON_LARGE |
|
||||
VM_FAULT_SET_HINDEX(h - hstates);
|
||||
}
|
||||
|
||||
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
|
||||
@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||
hugetlb_acct_memory(h, -(chg - freed));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
|
||||
/* Should be called in hugetlb_lock */
|
||||
static int is_hugepage_on_freelist(struct page *hpage)
|
||||
{
|
||||
struct page *page;
|
||||
struct page *tmp;
|
||||
struct hstate *h = page_hstate(hpage);
|
||||
int nid = page_to_nid(hpage);
|
||||
|
||||
list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
|
||||
if (page == hpage)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called from memory failure code.
|
||||
* Assume the caller holds page lock of the head page.
|
||||
*/
|
||||
void __isolate_hwpoisoned_huge_page(struct page *hpage)
|
||||
int dequeue_hwpoisoned_huge_page(struct page *hpage)
|
||||
{
|
||||
struct hstate *h = page_hstate(hpage);
|
||||
int nid = page_to_nid(hpage);
|
||||
int ret = -EBUSY;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
list_del(&hpage->lru);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
if (is_hugepage_on_freelist(hpage)) {
|
||||
list_del(&hpage->lru);
|
||||
set_page_refcounted(hpage);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
ret = 0;
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
@ -7,21 +7,26 @@
|
||||
* Free Software Foundation.
|
||||
*
|
||||
* High level machine check handler. Handles pages reported by the
|
||||
* hardware as being corrupted usually due to a 2bit ECC memory or cache
|
||||
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
|
||||
* failure.
|
||||
*
|
||||
* In addition there is a "soft offline" entry point that allows us to stop using
|
||||
* not-yet-corrupted-but-suspicious pages without killing anything.
|
||||
*
|
||||
* Handles page cache pages in various states. The tricky part
|
||||
* here is that we can access any page asynchronous to other VM
|
||||
* users, because memory failures could happen anytime and anywhere,
|
||||
* possibly violating some of their assumptions. This is why this code
|
||||
* has to be extremely careful. Generally it tries to use normal locking
|
||||
* rules, as in get the standard locks, even if that means the
|
||||
* error handling takes potentially a long time.
|
||||
*
|
||||
* The operation to map back from RMAP chains to processes has to walk
|
||||
* the complete process list and has non linear complexity with the number
|
||||
* mappings. In short it can be quite slow. But since memory corruptions
|
||||
* are rare we hope to get away with this.
|
||||
* here is that we can access any page asynchronously with respect to
|
||||
* other VM users, because memory failures could happen anytime and
|
||||
* anywhere. This could violate some of their assumptions. This is why
|
||||
* this code has to be extremely careful. Generally it tries to use
|
||||
* normal locking rules, as in get the standard locks, even if that means
|
||||
* the error handling takes potentially a long time.
|
||||
*
|
||||
* There are several operations here with exponential complexity because
|
||||
* of unsuitable VM data structures. For example the operation to map back
|
||||
* from RMAP chains to processes has to walk the complete process list and
|
||||
* has non linear complexity with the number of mappings. But since memory corruptions
|
||||
* are rare we hope to get away with this. This avoids impacting the core
|
||||
* VM.
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -30,7 +35,6 @@
|
||||
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
|
||||
* - pass bad pages to kdump next kernel
|
||||
*/
|
||||
#define DEBUG 1 /* remove me in 2.6.34 */
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/page-flags.h>
|
||||
@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* page_mapping() does not accept slab page
|
||||
* page_mapping() does not accept slab pages.
|
||||
*/
|
||||
if (PageSlab(p))
|
||||
return -EINVAL;
|
||||
@ -268,7 +272,7 @@ struct to_kill {
|
||||
struct list_head nd;
|
||||
struct task_struct *tsk;
|
||||
unsigned long addr;
|
||||
unsigned addr_valid:1;
|
||||
char addr_valid;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
|
||||
* a SIGKILL because the error is not contained anymore.
|
||||
*/
|
||||
if (tk->addr == -EFAULT) {
|
||||
pr_debug("MCE: Unable to find user space address %lx in %s\n",
|
||||
pr_info("MCE: Unable to find user space address %lx in %s\n",
|
||||
page_to_pfn(p), tsk->comm);
|
||||
tk->addr_valid = 0;
|
||||
}
|
||||
@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
|
||||
pfn, err);
|
||||
} else if (page_has_private(p) &&
|
||||
!try_to_release_page(p, GFP_NOIO)) {
|
||||
pr_debug("MCE %#lx: failed to release buffers\n", pfn);
|
||||
pr_info("MCE %#lx: failed to release buffers\n", pfn);
|
||||
} else {
|
||||
ret = RECOVERED;
|
||||
}
|
||||
@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
|
||||
* Issues:
|
||||
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
|
||||
* To narrow down kill region to one page, we need to break up pmd.
|
||||
* - To support soft-offlining for hugepage, we need to support hugepage
|
||||
* migration.
|
||||
*/
|
||||
static int me_huge_page(struct page *p, unsigned long pfn)
|
||||
{
|
||||
int res = 0;
|
||||
struct page *hpage = compound_head(p);
|
||||
/*
|
||||
* We can safely recover from error on free or reserved (i.e.
|
||||
@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
|
||||
* so there is no race between isolation and mapping/unmapping.
|
||||
*/
|
||||
if (!(page_mapping(hpage) || PageAnon(hpage))) {
|
||||
__isolate_hwpoisoned_huge_page(hpage);
|
||||
return RECOVERED;
|
||||
res = dequeue_hwpoisoned_huge_page(hpage);
|
||||
if (!res)
|
||||
return RECOVERED;
|
||||
}
|
||||
return DELAYED;
|
||||
}
|
||||
@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
|
||||
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
|
||||
}
|
||||
|
||||
#define N_UNMAP_TRIES 5
|
||||
|
||||
/*
|
||||
* Do all that is necessary to remove user space mappings. Unmap
|
||||
* the pages and send SIGBUS to the processes if the data was dirty.
|
||||
@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
|
||||
struct address_space *mapping;
|
||||
LIST_HEAD(tokill);
|
||||
int ret;
|
||||
int i;
|
||||
int kill = 1;
|
||||
struct page *hpage = compound_head(p);
|
||||
|
||||
@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
|
||||
if (kill)
|
||||
collect_procs(hpage, &tokill);
|
||||
|
||||
/*
|
||||
* try_to_unmap can fail temporarily due to races.
|
||||
* Try a few times (RED-PEN better strategy?)
|
||||
*/
|
||||
for (i = 0; i < N_UNMAP_TRIES; i++) {
|
||||
ret = try_to_unmap(hpage, ttu);
|
||||
if (ret == SWAP_SUCCESS)
|
||||
break;
|
||||
pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
|
||||
}
|
||||
|
||||
ret = try_to_unmap(hpage, ttu);
|
||||
if (ret != SWAP_SUCCESS)
|
||||
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
|
||||
pfn, page_mapcount(hpage));
|
||||
@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
* We need/can do nothing about count=0 pages.
|
||||
* 1) it's a free page, and therefore in safe hands:
|
||||
* prep_new_page() will be the gate keeper.
|
||||
* 2) it's part of a non-compound high order page.
|
||||
* 2) it's a free hugepage, which is also safe:
|
||||
* an affected hugepage will be dequeued from hugepage freelist,
|
||||
* so there's no concern about reusing it ever after.
|
||||
* 3) it's part of a non-compound high order page.
|
||||
* Implies some kernel user: cannot stop them from
|
||||
* R/W the page; let's pray that the page has been
|
||||
* used and will be freed some time later.
|
||||
@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
if (is_free_buddy_page(p)) {
|
||||
action_result(pfn, "free buddy", DELAYED);
|
||||
return 0;
|
||||
} else if (PageHuge(hpage)) {
|
||||
/*
|
||||
* Check "just unpoisoned", "filter hit", and
|
||||
* "race with other subpage."
|
||||
*/
|
||||
lock_page_nosync(hpage);
|
||||
if (!PageHWPoison(hpage)
|
||||
|| (hwpoison_filter(p) && TestClearPageHWPoison(p))
|
||||
|| (p != hpage && TestSetPageHWPoison(hpage))) {
|
||||
atomic_long_sub(nr_pages, &mce_bad_pages);
|
||||
return 0;
|
||||
}
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
res = dequeue_hwpoisoned_huge_page(hpage);
|
||||
action_result(pfn, "free huge",
|
||||
res ? IGNORED : DELAYED);
|
||||
unlock_page(hpage);
|
||||
return res;
|
||||
} else {
|
||||
action_result(pfn, "high order kernel", IGNORED);
|
||||
return -EBUSY;
|
||||
@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
|
||||
page = compound_head(p);
|
||||
|
||||
if (!PageHWPoison(p)) {
|
||||
pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
|
||||
pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
nr_pages = 1 << compound_order(page);
|
||||
|
||||
if (!get_page_unless_zero(page)) {
|
||||
/*
|
||||
* Since HWPoisoned hugepage should have non-zero refcount,
|
||||
* race between memory failure and unpoison seems to happen.
|
||||
* In such case unpoison fails and memory failure runs
|
||||
* to the end.
|
||||
*/
|
||||
if (PageHuge(page)) {
|
||||
pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
|
||||
return 0;
|
||||
}
|
||||
if (TestClearPageHWPoison(p))
|
||||
atomic_long_sub(nr_pages, &mce_bad_pages);
|
||||
pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
|
||||
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
|
||||
* the free buddy page pool.
|
||||
*/
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
|
||||
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
|
||||
atomic_long_sub(nr_pages, &mce_bad_pages);
|
||||
freeit = 1;
|
||||
if (PageHuge(page))
|
||||
clear_page_hwpoison_huge_page(page);
|
||||
}
|
||||
if (PageHuge(p))
|
||||
clear_page_hwpoison_huge_page(page);
|
||||
unlock_page(page);
|
||||
|
||||
put_page(page);
|
||||
@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
|
||||
static struct page *new_page(struct page *p, unsigned long private, int **x)
|
||||
{
|
||||
int nid = page_to_nid(p);
|
||||
return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
|
||||
if (PageHuge(p))
|
||||
return alloc_huge_page_node(page_hstate(compound_head(p)),
|
||||
nid);
|
||||
else
|
||||
return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
|
||||
* was free.
|
||||
*/
|
||||
set_migratetype_isolate(p);
|
||||
/*
|
||||
* When the target page is a free hugepage, just remove it
|
||||
* from free hugepage list.
|
||||
*/
|
||||
if (!get_page_unless_zero(compound_head(p))) {
|
||||
if (is_free_buddy_page(p)) {
|
||||
pr_debug("get_any_page: %#lx free buddy page\n", pfn);
|
||||
if (PageHuge(p)) {
|
||||
pr_info("get_any_page: %#lx free huge page\n", pfn);
|
||||
ret = dequeue_hwpoisoned_huge_page(compound_head(p));
|
||||
} else if (is_free_buddy_page(p)) {
|
||||
pr_info("get_any_page: %#lx free buddy page\n", pfn);
|
||||
/* Set hwpoison bit while page is still isolated */
|
||||
SetPageHWPoison(p);
|
||||
ret = 0;
|
||||
} else {
|
||||
pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
|
||||
pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
|
||||
pfn, p->flags);
|
||||
ret = -EIO;
|
||||
}
|
||||
@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int soft_offline_huge_page(struct page *page, int flags)
|
||||
{
|
||||
int ret;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
struct page *hpage = compound_head(page);
|
||||
LIST_HEAD(pagelist);
|
||||
|
||||
ret = get_any_page(page, pfn, flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret == 0)
|
||||
goto done;
|
||||
|
||||
if (PageHWPoison(hpage)) {
|
||||
put_page(hpage);
|
||||
pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
/* Keep page count to indicate a given hugepage is isolated. */
|
||||
|
||||
list_add(&hpage->lru, &pagelist);
|
||||
ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
|
||||
if (ret) {
|
||||
pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
|
||||
pfn, ret, page->flags);
|
||||
if (ret > 0)
|
||||
ret = -EIO;
|
||||
return ret;
|
||||
}
|
||||
done:
|
||||
if (!PageHWPoison(hpage))
|
||||
atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
dequeue_hwpoisoned_huge_page(hpage);
|
||||
/* keep elevated page count for bad page */
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* soft_offline_page - Soft offline a page.
|
||||
* @page: page to offline
|
||||
@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags)
|
||||
int ret;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
|
||||
if (PageHuge(page))
|
||||
return soft_offline_huge_page(page, flags);
|
||||
|
||||
ret = get_any_page(page, pfn, flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags)
|
||||
goto done;
|
||||
}
|
||||
if (!PageLRU(page)) {
|
||||
pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
||||
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
||||
pfn, page->flags);
|
||||
return -EIO;
|
||||
}
|
||||
@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags)
|
||||
if (PageHWPoison(page)) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
pr_debug("soft offline: %#lx page already poisoned\n", pfn);
|
||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags)
|
||||
put_page(page);
|
||||
if (ret == 1) {
|
||||
ret = 0;
|
||||
pr_debug("soft_offline: %#lx: invalidated\n", pfn);
|
||||
pr_info("soft_offline: %#lx: invalidated\n", pfn);
|
||||
goto done;
|
||||
}
|
||||
|
||||
@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags)
|
||||
list_add(&page->lru, &pagelist);
|
||||
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
|
||||
if (ret) {
|
||||
pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
|
||||
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
|
||||
pfn, ret, page->flags);
|
||||
if (ret > 0)
|
||||
ret = -EIO;
|
||||
}
|
||||
} else {
|
||||
pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
|
||||
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
|
||||
pfn, ret, page_count(page), page->flags);
|
||||
}
|
||||
if (ret)
|
||||
|
@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
if (ret & VM_FAULT_OOM)
|
||||
return i ? i : -ENOMEM;
|
||||
if (ret &
|
||||
(VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
|
||||
(VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
|
||||
VM_FAULT_SIGBUS))
|
||||
return i ? i : -EFAULT;
|
||||
BUG();
|
||||
}
|
||||
|
234
mm/migrate.c
234
mm/migrate.c
@ -32,6 +32,7 @@
|
||||
#include <linux/security.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/gfp.h>
|
||||
|
||||
#include "internal.h"
|
||||
@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
|
||||
pte_t *ptep, pte;
|
||||
spinlock_t *ptl;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (!pgd_present(*pgd))
|
||||
goto out;
|
||||
if (unlikely(PageHuge(new))) {
|
||||
ptep = huge_pte_offset(mm, addr);
|
||||
if (!ptep)
|
||||
goto out;
|
||||
ptl = &mm->page_table_lock;
|
||||
} else {
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (!pgd_present(*pgd))
|
||||
goto out;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (!pud_present(*pud))
|
||||
goto out;
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (!pud_present(*pud))
|
||||
goto out;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (!pmd_present(*pmd))
|
||||
goto out;
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (!pmd_present(*pmd))
|
||||
goto out;
|
||||
|
||||
ptep = pte_offset_map(pmd, addr);
|
||||
ptep = pte_offset_map(pmd, addr);
|
||||
|
||||
if (!is_swap_pte(*ptep)) {
|
||||
pte_unmap(ptep);
|
||||
goto out;
|
||||
}
|
||||
if (!is_swap_pte(*ptep)) {
|
||||
pte_unmap(ptep);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ptl = pte_lockptr(mm, pmd);
|
||||
}
|
||||
|
||||
ptl = pte_lockptr(mm, pmd);
|
||||
spin_lock(ptl);
|
||||
pte = *ptep;
|
||||
if (!is_swap_pte(pte))
|
||||
@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
|
||||
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
|
||||
if (is_write_migration_entry(entry))
|
||||
pte = pte_mkwrite(pte);
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
if (PageHuge(new))
|
||||
pte = pte_mkhuge(pte);
|
||||
#endif
|
||||
flush_cache_page(vma, addr, pte_pfn(pte));
|
||||
set_pte_at(mm, addr, ptep, pte);
|
||||
|
||||
if (PageAnon(new))
|
||||
if (PageHuge(new)) {
|
||||
if (PageAnon(new))
|
||||
hugepage_add_anon_rmap(new, vma, addr);
|
||||
else
|
||||
page_dup_rmap(new);
|
||||
} else if (PageAnon(new))
|
||||
page_add_anon_rmap(new, vma, addr);
|
||||
else
|
||||
page_add_file_rmap(new);
|
||||
@ -275,12 +293,60 @@ static int migrate_page_move_mapping(struct address_space *mapping,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* The expected number of remaining references is the same as that
|
||||
* of migrate_page_move_mapping().
|
||||
*/
|
||||
int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
struct page *newpage, struct page *page)
|
||||
{
|
||||
int expected_count;
|
||||
void **pslot;
|
||||
|
||||
if (!mapping) {
|
||||
if (page_count(page) != 1)
|
||||
return -EAGAIN;
|
||||
return 0;
|
||||
}
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
|
||||
pslot = radix_tree_lookup_slot(&mapping->page_tree,
|
||||
page_index(page));
|
||||
|
||||
expected_count = 2 + page_has_private(page);
|
||||
if (page_count(page) != expected_count ||
|
||||
(struct page *)radix_tree_deref_slot(pslot) != page) {
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
if (!page_freeze_refs(page, expected_count)) {
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
get_page(newpage);
|
||||
|
||||
radix_tree_replace_slot(pslot, newpage);
|
||||
|
||||
page_unfreeze_refs(page, expected_count);
|
||||
|
||||
__put_page(page);
|
||||
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the page to its new location
|
||||
*/
|
||||
static void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
{
|
||||
copy_highpage(newpage, page);
|
||||
if (PageHuge(page))
|
||||
copy_huge_page(newpage, page);
|
||||
else
|
||||
copy_highpage(newpage, page);
|
||||
|
||||
if (PageError(page))
|
||||
SetPageError(newpage);
|
||||
@ -723,6 +789,92 @@ move_newpage:
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Counterpart of unmap_and_move_page() for hugepage migration.
|
||||
*
|
||||
* This function doesn't wait the completion of hugepage I/O
|
||||
* because there is no race between I/O and migration for hugepage.
|
||||
* Note that currently hugepage I/O occurs only in direct I/O
|
||||
* where no lock is held and PG_writeback is irrelevant,
|
||||
* and writeback status of all subpages are counted in the reference
|
||||
* count of the head page (i.e. if all subpages of a 2MB hugepage are
|
||||
* under direct I/O, the reference of the head page is 512 and a bit more.)
|
||||
* This means that when we try to migrate hugepage whose subpages are
|
||||
* doing direct I/O, some references remain after try_to_unmap() and
|
||||
* hugepage migration fails without data corruption.
|
||||
*
|
||||
* There is also no race when direct I/O is issued on the page under migration,
|
||||
* because then pte is replaced with migration swap entry and direct I/O code
|
||||
* will wait in the page fault for migration to complete.
|
||||
*/
|
||||
static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
unsigned long private, struct page *hpage,
|
||||
int force, int offlining)
|
||||
{
|
||||
int rc = 0;
|
||||
int *result = NULL;
|
||||
struct page *new_hpage = get_new_page(hpage, private, &result);
|
||||
int rcu_locked = 0;
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
|
||||
if (!new_hpage)
|
||||
return -ENOMEM;
|
||||
|
||||
rc = -EAGAIN;
|
||||
|
||||
if (!trylock_page(hpage)) {
|
||||
if (!force)
|
||||
goto out;
|
||||
lock_page(hpage);
|
||||
}
|
||||
|
||||
if (PageAnon(hpage)) {
|
||||
rcu_read_lock();
|
||||
rcu_locked = 1;
|
||||
|
||||
if (page_mapped(hpage)) {
|
||||
anon_vma = page_anon_vma(hpage);
|
||||
atomic_inc(&anon_vma->external_refcount);
|
||||
}
|
||||
}
|
||||
|
||||
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
|
||||
|
||||
if (!page_mapped(hpage))
|
||||
rc = move_to_new_page(new_hpage, hpage, 1);
|
||||
|
||||
if (rc)
|
||||
remove_migration_ptes(hpage, hpage);
|
||||
|
||||
if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
|
||||
&anon_vma->lock)) {
|
||||
int empty = list_empty(&anon_vma->head);
|
||||
spin_unlock(&anon_vma->lock);
|
||||
if (empty)
|
||||
anon_vma_free(anon_vma);
|
||||
}
|
||||
|
||||
if (rcu_locked)
|
||||
rcu_read_unlock();
|
||||
out:
|
||||
unlock_page(hpage);
|
||||
|
||||
if (rc != -EAGAIN) {
|
||||
list_del(&hpage->lru);
|
||||
put_page(hpage);
|
||||
}
|
||||
|
||||
put_page(new_hpage);
|
||||
|
||||
if (result) {
|
||||
if (rc)
|
||||
*result = rc;
|
||||
else
|
||||
*result = page_to_nid(new_hpage);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* migrate_pages
|
||||
*
|
||||
@ -788,6 +940,52 @@ out:
|
||||
return nr_failed + retry;
|
||||
}
|
||||
|
||||
int migrate_huge_pages(struct list_head *from,
|
||||
new_page_t get_new_page, unsigned long private, int offlining)
|
||||
{
|
||||
int retry = 1;
|
||||
int nr_failed = 0;
|
||||
int pass = 0;
|
||||
struct page *page;
|
||||
struct page *page2;
|
||||
int rc;
|
||||
|
||||
for (pass = 0; pass < 10 && retry; pass++) {
|
||||
retry = 0;
|
||||
|
||||
list_for_each_entry_safe(page, page2, from, lru) {
|
||||
cond_resched();
|
||||
|
||||
rc = unmap_and_move_huge_page(get_new_page,
|
||||
private, page, pass > 2, offlining);
|
||||
|
||||
switch(rc) {
|
||||
case -ENOMEM:
|
||||
goto out;
|
||||
case -EAGAIN:
|
||||
retry++;
|
||||
break;
|
||||
case 0:
|
||||
break;
|
||||
default:
|
||||
/* Permanent failure */
|
||||
nr_failed++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
rc = 0;
|
||||
out:
|
||||
|
||||
list_for_each_entry_safe(page, page2, from, lru)
|
||||
put_page(page);
|
||||
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
return nr_failed + retry;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/*
|
||||
* Move a list of individual pages
|
||||
|
25
mm/rmap.c
25
mm/rmap.c
@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page,
|
||||
}
|
||||
|
||||
/**
|
||||
* __page_set_anon_rmap - setup new anonymous rmap
|
||||
* @page: the page to add the mapping to
|
||||
* @vma: the vm area in which the mapping is added
|
||||
* @address: the user virtual address mapped
|
||||
* __page_set_anon_rmap - set up new anonymous rmap
|
||||
* @page: Page to add to rmap
|
||||
* @vma: VM area to add page to.
|
||||
* @address: User virtual address of the mapping
|
||||
* @exclusive: the page is exclusively owned by the current process
|
||||
*/
|
||||
static void __page_set_anon_rmap(struct page *page,
|
||||
@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page,
|
||||
|
||||
BUG_ON(!anon_vma);
|
||||
|
||||
if (PageAnon(page))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the page isn't exclusively mapped into this vma,
|
||||
* we must use the _oldest_ possible anon_vma for the
|
||||
* page mapping!
|
||||
*/
|
||||
if (!exclusive) {
|
||||
if (PageAnon(page))
|
||||
return;
|
||||
if (!exclusive)
|
||||
anon_vma = anon_vma->root;
|
||||
} else {
|
||||
/*
|
||||
* In this case, swapped-out-but-not-discarded swap-cache
|
||||
* is remapped. So, no need to update page->mapping here.
|
||||
* We are convinced the anon_vma pointed to by page->mapping is not obsolete
|
||||
* because vma->anon_vma is necessary to be a family of it.
|
||||
*/
|
||||
if (PageAnon(page))
|
||||
return;
|
||||
}
|
||||
|
||||
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
|
||||
page->mapping = (struct address_space *) anon_vma;
|
||||
|
Loading…
x
Reference in New Issue
Block a user