userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor
The custom events are queued in ctx->event_wqh so as not to disturb the fast-path PF (page fault) queue-wait-wakeup functions. The events to be generated (other than PFs) are requested in the UFFD_API ioctl via the uffd_api.features bits. Those bits that are known to the kernel are then turned on and reported back to user space. Link: http://lkml.kernel.org/r/20161216144821.5183-7-aarcange@redhat.com Signed-off-by: Pavel Emelyanov <xemul@parallels.com> Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com> Cc: Hillf Danton <hillf.zj@alibaba-inc.com> Cc: Michael Rapoport <RAPOPORT@il.ibm.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
6dcc27fd39
commit
9cd75c3cd4
@ -12,6 +12,7 @@
|
|||||||
* mm/ksm.c (mm hashing).
|
* mm/ksm.c (mm hashing).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <linux/list.h>
|
||||||
#include <linux/hashtable.h>
|
#include <linux/hashtable.h>
|
||||||
#include <linux/sched.h>
|
#include <linux/sched.h>
|
||||||
#include <linux/mm.h>
|
#include <linux/mm.h>
|
||||||
@ -45,12 +46,16 @@ struct userfaultfd_ctx {
|
|||||||
wait_queue_head_t fault_wqh;
|
wait_queue_head_t fault_wqh;
|
||||||
/* waitqueue head for the pseudo fd to wakeup poll/read */
|
/* waitqueue head for the pseudo fd to wakeup poll/read */
|
||||||
wait_queue_head_t fd_wqh;
|
wait_queue_head_t fd_wqh;
|
||||||
|
/* waitqueue head for events */
|
||||||
|
wait_queue_head_t event_wqh;
|
||||||
/* a refile sequence protected by fault_pending_wqh lock */
|
/* a refile sequence protected by fault_pending_wqh lock */
|
||||||
struct seqcount refile_seq;
|
struct seqcount refile_seq;
|
||||||
/* pseudo fd refcounting */
|
/* pseudo fd refcounting */
|
||||||
atomic_t refcount;
|
atomic_t refcount;
|
||||||
/* userfaultfd syscall flags */
|
/* userfaultfd syscall flags */
|
||||||
unsigned int flags;
|
unsigned int flags;
|
||||||
|
/* features requested from the userspace */
|
||||||
|
unsigned int features;
|
||||||
/* state machine */
|
/* state machine */
|
||||||
enum userfaultfd_state state;
|
enum userfaultfd_state state;
|
||||||
/* released */
|
/* released */
|
||||||
@ -142,6 +147,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
|
|||||||
VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
|
VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
|
||||||
VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
|
VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
|
||||||
VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
|
VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
|
||||||
|
VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
|
||||||
|
VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
|
||||||
VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
|
VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
|
||||||
VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
|
VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
|
||||||
mmdrop(ctx->mm);
|
mmdrop(ctx->mm);
|
||||||
@ -458,6 +465,59 @@ out:
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __maybe_unused userfaultfd_event_wait_completion(
|
||||||
|
struct userfaultfd_ctx *ctx,
|
||||||
|
struct userfaultfd_wait_queue *ewq)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
ewq->ctx = ctx;
|
||||||
|
init_waitqueue_entry(&ewq->wq, current);
|
||||||
|
|
||||||
|
spin_lock(&ctx->event_wqh.lock);
|
||||||
|
/*
|
||||||
|
* After the __add_wait_queue the ewq is visible to userland
|
||||||
|
* through poll/read().
|
||||||
|
*/
|
||||||
|
__add_wait_queue(&ctx->event_wqh, &ewq->wq);
|
||||||
|
for (;;) {
|
||||||
|
set_current_state(TASK_KILLABLE);
|
||||||
|
if (ewq->msg.event == 0)
|
||||||
|
break;
|
||||||
|
if (ACCESS_ONCE(ctx->released) ||
|
||||||
|
fatal_signal_pending(current)) {
|
||||||
|
ret = -1;
|
||||||
|
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
spin_unlock(&ctx->event_wqh.lock);
|
||||||
|
|
||||||
|
wake_up_poll(&ctx->fd_wqh, POLLIN);
|
||||||
|
schedule();
|
||||||
|
|
||||||
|
spin_lock(&ctx->event_wqh.lock);
|
||||||
|
}
|
||||||
|
__set_current_state(TASK_RUNNING);
|
||||||
|
spin_unlock(&ctx->event_wqh.lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ctx may go away after this if the userfault pseudo fd is
|
||||||
|
* already released.
|
||||||
|
*/
|
||||||
|
|
||||||
|
userfaultfd_ctx_put(ctx);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
|
||||||
|
struct userfaultfd_wait_queue *ewq)
|
||||||
|
{
|
||||||
|
ewq->msg.event = 0;
|
||||||
|
wake_up_locked(&ctx->event_wqh);
|
||||||
|
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
|
||||||
|
}
|
||||||
|
|
||||||
static int userfaultfd_release(struct inode *inode, struct file *file)
|
static int userfaultfd_release(struct inode *inode, struct file *file)
|
||||||
{
|
{
|
||||||
struct userfaultfd_ctx *ctx = file->private_data;
|
struct userfaultfd_ctx *ctx = file->private_data;
|
||||||
@ -546,6 +606,12 @@ static inline struct userfaultfd_wait_queue *find_userfault(
|
|||||||
return find_userfault_in(&ctx->fault_pending_wqh);
|
return find_userfault_in(&ctx->fault_pending_wqh);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline struct userfaultfd_wait_queue *find_userfault_evt(
|
||||||
|
struct userfaultfd_ctx *ctx)
|
||||||
|
{
|
||||||
|
return find_userfault_in(&ctx->event_wqh);
|
||||||
|
}
|
||||||
|
|
||||||
static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
|
static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
|
||||||
{
|
{
|
||||||
struct userfaultfd_ctx *ctx = file->private_data;
|
struct userfaultfd_ctx *ctx = file->private_data;
|
||||||
@ -577,6 +643,9 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
|
|||||||
smp_mb();
|
smp_mb();
|
||||||
if (waitqueue_active(&ctx->fault_pending_wqh))
|
if (waitqueue_active(&ctx->fault_pending_wqh))
|
||||||
ret = POLLIN;
|
ret = POLLIN;
|
||||||
|
else if (waitqueue_active(&ctx->event_wqh))
|
||||||
|
ret = POLLIN;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
default:
|
default:
|
||||||
WARN_ON_ONCE(1);
|
WARN_ON_ONCE(1);
|
||||||
@ -641,6 +710,19 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
spin_unlock(&ctx->fault_pending_wqh.lock);
|
spin_unlock(&ctx->fault_pending_wqh.lock);
|
||||||
|
|
||||||
|
spin_lock(&ctx->event_wqh.lock);
|
||||||
|
uwq = find_userfault_evt(ctx);
|
||||||
|
if (uwq) {
|
||||||
|
*msg = uwq->msg;
|
||||||
|
|
||||||
|
userfaultfd_event_complete(ctx, uwq);
|
||||||
|
spin_unlock(&ctx->event_wqh.lock);
|
||||||
|
ret = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
spin_unlock(&ctx->event_wqh.lock);
|
||||||
|
|
||||||
if (signal_pending(current)) {
|
if (signal_pending(current)) {
|
||||||
ret = -ERESTARTSYS;
|
ret = -ERESTARTSYS;
|
||||||
break;
|
break;
|
||||||
@ -1184,6 +1266,14 @@ out:
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline unsigned int uffd_ctx_features(__u64 user_features)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* For the current set of features the user-supplied feature bits and the bits stored in ctx->features just coincide
|
||||||
|
*/
|
||||||
|
return (unsigned int)user_features;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* userland asks for a certain API version and we return which bits
|
* userland asks for a certain API version and we return which bits
|
||||||
* and ioctl commands are implemented in this kernel for such API
|
* and ioctl commands are implemented in this kernel for such API
|
||||||
@ -1202,19 +1292,21 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
|
|||||||
ret = -EFAULT;
|
ret = -EFAULT;
|
||||||
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
|
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
|
||||||
goto out;
|
goto out;
|
||||||
if (uffdio_api.api != UFFD_API || uffdio_api.features) {
|
if (uffdio_api.api != UFFD_API ||
|
||||||
|
(uffdio_api.features & ~UFFD_API_FEATURES)) {
|
||||||
memset(&uffdio_api, 0, sizeof(uffdio_api));
|
memset(&uffdio_api, 0, sizeof(uffdio_api));
|
||||||
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
|
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
|
||||||
goto out;
|
goto out;
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
uffdio_api.features = UFFD_API_FEATURES;
|
uffdio_api.features &= UFFD_API_FEATURES;
|
||||||
uffdio_api.ioctls = UFFD_API_IOCTLS;
|
uffdio_api.ioctls = UFFD_API_IOCTLS;
|
||||||
ret = -EFAULT;
|
ret = -EFAULT;
|
||||||
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
|
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
|
||||||
goto out;
|
goto out;
|
||||||
ctx->state = UFFD_STATE_RUNNING;
|
ctx->state = UFFD_STATE_RUNNING;
|
||||||
|
ctx->features = uffd_ctx_features(uffdio_api.features);
|
||||||
ret = 0;
|
ret = 0;
|
||||||
out:
|
out:
|
||||||
return ret;
|
return ret;
|
||||||
@ -1301,6 +1393,7 @@ static void init_once_userfaultfd_ctx(void *mem)
|
|||||||
|
|
||||||
init_waitqueue_head(&ctx->fault_pending_wqh);
|
init_waitqueue_head(&ctx->fault_pending_wqh);
|
||||||
init_waitqueue_head(&ctx->fault_wqh);
|
init_waitqueue_head(&ctx->fault_wqh);
|
||||||
|
init_waitqueue_head(&ctx->event_wqh);
|
||||||
init_waitqueue_head(&ctx->fd_wqh);
|
init_waitqueue_head(&ctx->fd_wqh);
|
||||||
seqcount_init(&ctx->refile_seq);
|
seqcount_init(&ctx->refile_seq);
|
||||||
}
|
}
|
||||||
@ -1341,6 +1434,7 @@ static struct file *userfaultfd_file_create(int flags)
|
|||||||
|
|
||||||
atomic_set(&ctx->refcount, 1);
|
atomic_set(&ctx->refcount, 1);
|
||||||
ctx->flags = flags;
|
ctx->flags = flags;
|
||||||
|
ctx->features = 0;
|
||||||
ctx->state = UFFD_STATE_WAIT_API;
|
ctx->state = UFFD_STATE_WAIT_API;
|
||||||
ctx->released = false;
|
ctx->released = false;
|
||||||
ctx->mm = current->mm;
|
ctx->mm = current->mm;
|
||||||
|
Loading…
Reference in New Issue
Block a user