diff --git a/include/linux/mm.h b/include/linux/mm.h index 9f08a11b355c..215327daffae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1686,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time) last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } + +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) +{ + unsigned int pid_bit; + + pid_bit = current->pid % BITS_PER_LONG; + if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) { + __set_bit(pid_bit, &vma->numab_state->access_pids); + } +} #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { @@ -1735,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } + +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) +{ +} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e1a42673769..f8cbd8efc7cb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -477,6 +477,7 @@ struct vma_lock { struct vma_numab_state { unsigned long next_scan; + unsigned long access_pids; }; /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7072de1686d5..ef27b5931480 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2928,6 +2928,21 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } +static bool vma_is_accessed(struct vm_area_struct *vma) +{ + /* + * Allow unconditional access the first two times, so that all the pages + * of VMAs get prot_none faults introduced irrespective of accesses. + * This is also done to avoid any side effect of task scanning + * amplifying the unfairness of disjoint set of VMAs' access. 
+ */ + if (READ_ONCE(current->mm->numa_scan_seq) < 2) + return true; + + return test_bit(current->pid % BITS_PER_LONG, + &vma->numab_state->access_pids); +} + /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). @@ -3046,6 +3061,10 @@ static void task_numa_work(struct callback_head *work) vma->numab_state->next_scan)) continue; + /* Do not scan the VMA if the task has not accessed it */ + if (!vma_is_accessed(vma)) + continue; + do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); diff --git a/mm/memory.c b/mm/memory.c index 9999574a9636..f77fccb5310c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4661,6 +4661,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, { get_page(page); + /* Record the current PID accessing the VMA */ + vma_set_access_pid_bit(vma); + count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);