io_uring-futex-2023-10-30

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmVAUXUQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpuGsEADEs0/4uXb8kLUF/y0B0bY9jmwiw5id14g5
 TkAH9lbceV0Yv0E1tPeWYIz7Y7s83UOduFVZo4hRH8EysH3IYFZCI/ny3v2nJ1av
 lN7F7YegVOu6qx77e/CwLo7on14awHkSo8pUdCOm6tYLunLg42miRf+xTpSAL0Mg
 ONnt0WxWDOgdNvTaGwBPaVE78FAWK8nc2ACzonQGfzCl2VXOsSy9JaJJMv8eyXOf
 VVZCNcSvHh/zVznlC1YPoZh/bgS2UUJmIGL/XMQnM5qzbK1IPpzlN0cu8rje3s9b
 TUKBKqr6xhC9nyAS1qAjgZ98RfjVnzcbMX+aWEb/Z0y9XFJVSSQQdW+f9A/0KLZm
 jAejHJpNuqwEdB9MplHTXdeSDTkJH3YNbXvtwA6cc/KpZ1FVQXlhSJPp/mbOa7qe
 IIeg6SYt84uZ2HxflTtm+I1uVE9QMcsesy3FIK4kxhA8jSximQw+hPZ3xrv4AHLd
 cTkRAzfXPUFsJJQCgpv289QXobV/vsFhCFTHFxv63H+EGpJ7e1EaW6Eq0pAHG0Ai
 8kk5Ns29jzTVer1W3sMMeDaZ7S8hGRAyRC+Zb/0QxtGsmvxikB0qY1GpdRGPFueQ
 gOawhLZdhkigIsq0U1UGMpHKY0G1Sl9wvHuH2qzUKeWk+vFRv5RwR6zQuVJr2Jo/
 j3HgyYDs7Q==
 =Z0L0
 -----END PGP SIGNATURE-----

Merge tag 'io_uring-futex-2023-10-30' of git://git.kernel.dk/linux

Pull io_uring futex support from Jens Axboe:
 "This adds support for using futexes through io_uring - first futex
  wake and wait, and then the vectored variant of waiting, futex waitv.

  For all of wait/wake/waitv, we support the bitset variant, as the
  'normal' variants can be easily implemented on top of that.

  PI and requeue are not supported through io_uring, just the
  above-mentioned parts. This may change in the future, but in the spirit of
  keeping this small (and based on what people have been asking for),
  this is what we currently have.

  Wake support is pretty straightforward; most of the thought has gone
  into the wait side to avoid needing to offload wait operations to a
  blocking context. Instead, we rely on the usual callbacks to retry and
  post a completion event when appropriate.

  As far as I can recall, the first request for futex support with
  io_uring came from Andres Freund, working on postgres. His aio rework
  of postgres was one of the early adopters of io_uring, and futex
  support was a natural extension for that. This is relevant both from a
  usability point of view and for efficiency and performance. In
  Andres's words, for the former:

     Futex wait support in io_uring makes it a lot easier to avoid
     deadlocks in concurrent programs that have their own buffer pool:
     Obviously pages in the application buffer pool have to be locked
     during IO. If the initiator of IO A needs to wait for a held lock
     B, the holder of lock B might wait for the IO A to complete. The
     ability to wait for a lock and IO completions at the same time
     provides an efficient way to avoid such deadlocks

  and in terms of efficiency, even without unlocking the full potential
  yet, Andres says:

     Futex wake support in io_uring is useful because it allows for more
     efficient directed wakeups. For some "locks" postgres has queues
     implemented in userspace, with wakeup logic that cannot easily be
     implemented with FUTEX_WAKE_BITSET on a single "futex word"
     (imagine waiting for journal flushes to have completed up to a
     certain point).

     Thus a "lock release" sometimes needs to wake up many processes in a
     row. A quick-and-dirty conversion to doing these wakeups via
     io_uring led to a 3% throughput increase, with 12% fewer context
     switches, albeit in a fairly extreme workload"

* tag 'io_uring-futex-2023-10-30' of git://git.kernel.dk/linux:
  io_uring: add support for vectored futex waits
  futex: make the vectored futex operations available
  futex: make futex_parse_waitv() available as a helper
  futex: add wake_data to struct futex_q
  io_uring: add support for futex wake and wait
  futex: abstract out a __futex_wake_mark() helper
  futex: factor out the futex wake handling
  futex: move FUTEX2_VALID_MASK to futex.h
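
Usage note (not part of the merge itself): the new opcodes reuse existing SQE
fields rather than growing the SQE. As io_futex_prep() in the diff below shows,
the futex word is passed in sqe->addr, the value in sqe->addr2, the bitset mask
in sqe->addr3, and the FUTEX2_* flags in sqe->fd, with sqe->len left at zero;
for IORING_OP_FUTEX_WAKE, sqe->addr2 instead carries the number of waiters to
wake. The sketch below shows how a wait plus a directed wake might be submitted
from userspace. It assumes a 6.7+ kernel with matching liburing/uapi headers;
FUTEX2_SIZE_U32 and FUTEX2_PRIVATE come from the parallel futex2 uapi work in
<linux/futex.h>, not from this diff, and the two prep helpers are illustrative
names, not liburing API.

#include <liburing.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fill an SQE for IORING_OP_FUTEX_WAIT, mirroring what io_futex_prep() reads. */
static void prep_futex_wait(struct io_uring_sqe *sqe, uint32_t *word,
			    uint64_t expected, uint64_t mask, uint64_t tag)
{
	memset(sqe, 0, sizeof(*sqe));		/* len/buf_index/file_index must stay 0 */
	sqe->opcode = IORING_OP_FUTEX_WAIT;
	sqe->fd = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE;	/* FUTEX2_* flags travel in ->fd */
	sqe->addr = (uintptr_t)word;		/* futex word to wait on */
	sqe->addr2 = expected;			/* value the word must still contain */
	sqe->addr3 = mask;			/* wait bitset */
	sqe->user_data = tag;
}

/* Fill an SQE for IORING_OP_FUTEX_WAKE; here sqe->addr2 is the number to wake. */
static void prep_futex_wake(struct io_uring_sqe *sqe, uint32_t *word,
			    uint64_t nr_wake, uint64_t mask, uint64_t tag)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FUTEX_WAKE;
	sqe->fd = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE;
	sqe->addr = (uintptr_t)word;
	sqe->addr2 = nr_wake;
	sqe->addr3 = mask;			/* wake bitset */
	sqe->user_data = tag;
}

int main(void)
{
	static uint32_t futex_word;		/* the futex word, initially 0 */
	struct io_uring ring;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	/* Arm an async wait; it is queued inline during submit and does not block. */
	prep_futex_wait(io_uring_get_sqe(&ring), &futex_word, 0,
			FUTEX_BITSET_MATCH_ANY, 1);
	io_uring_submit(&ring);

	/* Directed wake of at most one waiter (normally issued by another thread). */
	prep_futex_wake(io_uring_get_sqe(&ring), &futex_word, 1,
			FUTEX_BITSET_MATCH_ANY, 2);
	io_uring_submit(&ring);

	/* Expect two CQEs: res 1 for the wake (one waiter woken), res 0 for the wait. */
	for (int i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		printf("user_data=%llu res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}

A vectored wait (IORING_OP_FUTEX_WAITV) is submitted the same way, except that
io_futexv_prep() takes a pointer to a futex_waitv array in sqe->addr and its
length in sqe->len, with no flags in sqe->fd (each entry carries its own flags).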
Merged by Linus Torvalds, 2023-11-01 11:25:08 -10:00 (commit 4de520f1fc)
13 changed files with 546 additions and 28 deletions

@ -321,6 +321,11 @@ struct io_ring_ctx {
struct hlist_head waitid_list;
#ifdef CONFIG_FUTEX
struct hlist_head futex_list;
struct io_alloc_cache futex_cache;
#endif
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */

@ -70,6 +70,7 @@ struct io_uring_sqe {
__u32 msg_ring_flags;
__u32 uring_cmd_flags;
__u32 waitid_flags;
__u32 futex_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@ -249,6 +250,9 @@ enum io_uring_op {
IORING_OP_SENDMSG_ZC,
IORING_OP_READ_MULTISHOT,
IORING_OP_WAITID,
IORING_OP_FUTEX_WAIT,
IORING_OP_FUTEX_WAKE,
IORING_OP_FUTEX_WAITV,
/* this goes last, obviously */
IORING_OP_LAST,

@ -10,3 +10,4 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
notif.o waitid.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o

@ -16,6 +16,7 @@
#include "poll.h"
#include "timeout.h"
#include "waitid.h"
#include "futex.h"
#include "cancel.h"
struct io_cancel {
@ -124,6 +125,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
if (ret != -ENOENT)
return ret;
ret = io_futex_cancel(ctx, cd, issue_flags);
if (ret != -ENOENT)
return ret;
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);

@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IORING_CANCEL_H
#define IORING_CANCEL_H
#include <linux/io_uring_types.h>
@ -22,3 +24,5 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
#endif

io_uring/futex.c (new file, 386 lines)

@ -0,0 +1,386 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "../kernel/futex/futex.h"
#include "io_uring.h"
#include "rsrc.h"
#include "futex.h"
struct io_futex {
struct file *file;
union {
u32 __user *uaddr;
struct futex_waitv __user *uwaitv;
};
unsigned long futex_val;
unsigned long futex_mask;
unsigned long futexv_owned;
u32 futex_flags;
unsigned int futex_nr;
bool futexv_unqueued;
};
struct io_futex_data {
union {
struct futex_q q;
struct io_cache_entry cache;
};
struct io_kiocb *req;
};
void io_futex_cache_init(struct io_ring_ctx *ctx)
{
io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_futex_data));
}
static void io_futex_cache_entry_free(struct io_cache_entry *entry)
{
kfree(container_of(entry, struct io_futex_data, cache));
}
void io_futex_cache_free(struct io_ring_ctx *ctx)
{
io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
}
static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
req->async_data = NULL;
hlist_del_init(&req->hash_node);
io_req_task_complete(req, ts);
}
static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_futex_data *ifd = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
io_tw_lock(ctx, ts);
if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
kfree(ifd);
__io_futex_complete(req, ts);
}
static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
struct futex_vector *futexv = req->async_data;
io_tw_lock(req->ctx, ts);
if (!iof->futexv_unqueued) {
int res;
res = futex_unqueue_multiple(futexv, iof->futex_nr);
if (res != -1)
io_req_set_res(req, res, 0);
}
kfree(req->async_data);
req->flags &= ~REQ_F_ASYNC_DATA;
__io_futex_complete(req, ts);
}
static bool io_futexv_claim(struct io_futex *iof)
{
if (test_bit(0, &iof->futexv_owned) ||
test_and_set_bit_lock(0, &iof->futexv_owned))
return false;
return true;
}
static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
/* futex wake already done or in progress */
if (req->opcode == IORING_OP_FUTEX_WAIT) {
struct io_futex_data *ifd = req->async_data;
if (!futex_unqueue(&ifd->q))
return false;
req->io_task_work.func = io_futex_complete;
} else {
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
if (!io_futexv_claim(iof))
return false;
req->io_task_work.func = io_futexv_complete;
}
hlist_del_init(&req->hash_node);
io_req_set_res(req, -ECANCELED, 0);
io_req_task_work_add(req);
return true;
}
int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags)
{
struct hlist_node *tmp;
struct io_kiocb *req;
int nr = 0;
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
return -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
if (req->cqe.user_data != cd->data &&
!(cd->flags & IORING_ASYNC_CANCEL_ANY))
continue;
if (__io_futex_cancel(ctx, req))
nr++;
if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
break;
}
io_ring_submit_unlock(ctx, issue_flags);
if (nr)
return nr;
return -ENOENT;
}
bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
bool found = false;
lockdep_assert_held(&ctx->uring_lock);
hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
if (!io_match_task_safe(req, task, cancel_all))
continue;
__io_futex_cancel(ctx, req);
found = true;
}
return found;
}
int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
u32 flags;
if (unlikely(sqe->len || sqe->futex_flags || sqe->buf_index ||
sqe->file_index))
return -EINVAL;
iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
iof->futex_val = READ_ONCE(sqe->addr2);
iof->futex_mask = READ_ONCE(sqe->addr3);
flags = READ_ONCE(sqe->fd);
if (flags & ~FUTEX2_VALID_MASK)
return -EINVAL;
iof->futex_flags = futex2_to_flags(flags);
if (!futex_flags_valid(iof->futex_flags))
return -EINVAL;
if (!futex_validate_input(iof->futex_flags, iof->futex_val) ||
!futex_validate_input(iof->futex_flags, iof->futex_mask))
return -EINVAL;
return 0;
}
static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
{
struct io_kiocb *req = q->wake_data;
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
if (!io_futexv_claim(iof))
return;
if (unlikely(!__futex_wake_mark(q)))
return;
io_req_set_res(req, 0, 0);
req->io_task_work.func = io_futexv_complete;
io_req_task_work_add(req);
}
int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
struct futex_vector *futexv;
int ret;
/* No flags or mask supported for waitv */
if (unlikely(sqe->fd || sqe->buf_index || sqe->file_index ||
sqe->addr2 || sqe->futex_flags || sqe->addr3))
return -EINVAL;
iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
iof->futex_nr = READ_ONCE(sqe->len);
if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX)
return -EINVAL;
futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL);
if (!futexv)
return -ENOMEM;
ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr,
io_futex_wakev_fn, req);
if (ret) {
kfree(futexv);
return ret;
}
iof->futexv_owned = 0;
iof->futexv_unqueued = 0;
req->flags |= REQ_F_ASYNC_DATA;
req->async_data = futexv;
return 0;
}
static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
{
struct io_futex_data *ifd = container_of(q, struct io_futex_data, q);
struct io_kiocb *req = ifd->req;
if (unlikely(!__futex_wake_mark(q)))
return;
io_req_set_res(req, 0, 0);
req->io_task_work.func = io_futex_complete;
io_req_task_work_add(req);
}
static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
{
struct io_cache_entry *entry;
entry = io_alloc_cache_get(&ctx->futex_cache);
if (entry)
return container_of(entry, struct io_futex_data, cache);
return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
}
int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
struct futex_vector *futexv = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
int ret, woken = -1;
io_ring_submit_lock(ctx, issue_flags);
ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken);
/*
* Error case, ret is < 0. Mark the request as failed.
*/
if (unlikely(ret < 0)) {
io_ring_submit_unlock(ctx, issue_flags);
req_set_fail(req);
io_req_set_res(req, ret, 0);
kfree(futexv);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
return IOU_OK;
}
/*
* 0 return means that we successfully setup the waiters, and that
* nobody triggered a wakeup while we were doing so. If the wakeup
* happened post setup, the task_work will be run post this issue and
* under the submission lock. 1 means We got woken while setting up,
* let that side do the completion. Note that
* futex_wait_multiple_setup() will have unqueued all the futexes in
* this case. Mark us as having done that already, since this is
* different from normal wakeup.
*/
if (!ret) {
/*
* If futex_wait_multiple_setup() returns 0 for a
* successful setup, then the task state will not be
* runnable. This is fine for the sync syscall, as
* it'll be blocking unless we already got one of the
* futexes woken, but it obviously won't work for an
* async invocation. Mark us runnable again.
*/
__set_current_state(TASK_RUNNING);
hlist_add_head(&req->hash_node, &ctx->futex_list);
} else {
iof->futexv_unqueued = 1;
if (woken != -1)
io_req_set_res(req, woken, 0);
}
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
struct io_ring_ctx *ctx = req->ctx;
struct io_futex_data *ifd = NULL;
struct futex_hash_bucket *hb;
int ret;
if (!iof->futex_mask) {
ret = -EINVAL;
goto done;
}
io_ring_submit_lock(ctx, issue_flags);
ifd = io_alloc_ifd(ctx);
if (!ifd) {
ret = -ENOMEM;
goto done_unlock;
}
req->async_data = ifd;
ifd->q = futex_q_init;
ifd->q.bitset = iof->futex_mask;
ifd->q.wake = io_futex_wake_fn;
ifd->req = req;
ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags,
&ifd->q, &hb);
if (!ret) {
hlist_add_head(&req->hash_node, &ctx->futex_list);
io_ring_submit_unlock(ctx, issue_flags);
futex_queue(&ifd->q, hb);
return IOU_ISSUE_SKIP_COMPLETE;
}
done_unlock:
io_ring_submit_unlock(ctx, issue_flags);
done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
kfree(ifd);
return IOU_OK;
}
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
int ret;
/*
* Strict flags - ensure that waking 0 futexes yields a 0 result.
* See commit 43adf8449510 ("futex: FLAGS_STRICT") for details.
*/
ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags,
iof->futex_val, iof->futex_mask);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}

io_uring/futex.h (new file, 36 lines)

@ -0,0 +1,36 @@
// SPDX-License-Identifier: GPL-2.0
#include "cancel.h"
int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags);
int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags);
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags);
#if defined(CONFIG_FUTEX)
int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags);
bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all);
void io_futex_cache_init(struct io_ring_ctx *ctx);
void io_futex_cache_free(struct io_ring_ctx *ctx);
#else
static inline int io_futex_cancel(struct io_ring_ctx *ctx,
struct io_cancel_data *cd,
unsigned int issue_flags)
{
return 0;
}
static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
struct task_struct *task, bool cancel_all)
{
return false;
}
static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
{
}
static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
{
}
#endif

@ -93,6 +93,7 @@
#include "net.h"
#include "notif.h"
#include "waitid.h"
#include "futex.h"
#include "timeout.h"
#include "poll.h"
@ -330,6 +331,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
sizeof(struct async_poll));
io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_msghdr));
io_futex_cache_init(ctx);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@ -349,6 +351,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_HLIST_HEAD(&ctx->waitid_list);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
#endif
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
@ -2914,6 +2919,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
@ -3357,6 +3363,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
ret |= io_waitid_remove_all(ctx, task, cancel_all);
ret |= io_futex_remove_all(ctx, task, cancel_all);
ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);

@ -34,6 +34,7 @@
#include "cancel.h"
#include "rw.h"
#include "waitid.h"
#include "futex.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@ -444,6 +445,30 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_waitid_prep,
.issue = io_waitid,
},
[IORING_OP_FUTEX_WAIT] = {
#if defined(CONFIG_FUTEX)
.prep = io_futex_prep,
.issue = io_futex_wait,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_FUTEX_WAKE] = {
#if defined(CONFIG_FUTEX)
.prep = io_futex_prep,
.issue = io_futex_wake,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_FUTEX_WAITV] = {
#if defined(CONFIG_FUTEX)
.prep = io_futexv_prep,
.issue = io_futexv_wait,
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
const struct io_cold_def io_cold_defs[] = {
@ -670,6 +695,15 @@ const struct io_cold_def io_cold_defs[] = {
.name = "WAITID",
.async_size = sizeof(struct io_waitid_async),
},
[IORING_OP_FUTEX_WAIT] = {
.name = "FUTEX_WAIT",
},
[IORING_OP_FUTEX_WAKE] = {
.name = "FUTEX_WAKE",
},
[IORING_OP_FUTEX_WAITV] = {
.name = "FUTEX_WAITV",
},
};
const char *io_uring_get_opcode(u8 opcode)

@ -52,6 +52,8 @@ static inline unsigned int futex_to_flags(unsigned int op)
return flags;
}
#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
/* FUTEX2_ to FLAGS_ */
static inline unsigned int futex2_to_flags(unsigned int flags2)
{
@ -137,11 +139,16 @@ struct futex_pi_state {
union futex_key key;
} __randomize_layout;
struct futex_q;
typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @wake: the wake handler for this queue
* @wake_data: data associated with the wake handler
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
@ -166,6 +173,8 @@ struct futex_q {
struct task_struct *task;
spinlock_t *lock_ptr;
futex_wake_fn *wake;
void *wake_data;
union futex_key key;
struct futex_pi_state *pi_state;
struct rt_mutex_waiter *rt_waiter;
@ -212,6 +221,7 @@ extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb);
extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout);
extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
extern int fault_in_user_writeable(u32 __user *uaddr);
@ -351,6 +361,16 @@ struct futex_vector {
struct futex_q q;
};
extern int futex_parse_waitv(struct futex_vector *futexv,
struct futex_waitv __user *uwaitv,
unsigned int nr_futexes, futex_wake_fn *wake,
void *wake_data);
extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
int *woken);
extern int futex_unqueue_multiple(struct futex_vector *v, int count);
extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
struct hrtimer_sleeper *to);

@ -58,6 +58,7 @@ enum {
const struct futex_q futex_q_init = {
/* list gets initialized in futex_queue()*/
.wake = futex_wake_mark,
.key = FUTEX_KEY_INIT,
.bitset = FUTEX_BITSET_MATCH_ANY,
.requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
@ -593,7 +594,7 @@ retry_private:
/* Plain futexes just wake or requeue and are done */
if (!requeue_pi) {
if (++task_count <= nr_wake)
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
else
requeue_futex(this, hb1, hb2, &key2);
continue;

@ -179,19 +179,20 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
/**
* futex_parse_waitv - Parse a waitv array from userspace
* @futexv: Kernel side list of waiters to be filled
* @uwaitv: Userspace list to be parsed
* @nr_futexes: Length of futexv
* @wake: Wake to call when futex is woken
* @wake_data: Data for the wake handler
*
* Return: Error code on failure, 0 on success
*/
static int futex_parse_waitv(struct futex_vector *futexv,
struct futex_waitv __user *uwaitv,
unsigned int nr_futexes)
int futex_parse_waitv(struct futex_vector *futexv,
struct futex_waitv __user *uwaitv,
unsigned int nr_futexes, futex_wake_fn *wake,
void *wake_data)
{
struct futex_waitv aux;
unsigned int i;
@ -216,6 +217,8 @@ static int futex_parse_waitv(struct futex_vector *futexv,
futexv[i].w.val = aux.val;
futexv[i].w.uaddr = aux.uaddr;
futexv[i].q = futex_q_init;
futexv[i].q.wake = wake;
futexv[i].q.wake_data = wake_data;
}
return 0;
@ -308,7 +311,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
goto destroy_timer;
}
ret = futex_parse_waitv(futexv, waiters, nr_futexes);
ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
NULL);
if (!ret)
ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
@ -423,7 +427,7 @@ SYSCALL_DEFINE4(futex_requeue,
if (!waiters)
return -EINVAL;
ret = futex_parse_waitv(futexes, waiters, 2);
ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
if (ret)
return ret;

@ -106,6 +106,24 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
bool __futex_wake_mark(struct futex_q *q)
{
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
return false;
__futex_unqueue(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
* is written, without taking any locks. This is possible in the event
* of a spurious wakeup, for example. A memory barrier is required here
* to prevent the following store to lock_ptr from getting ahead of the
* plist_del in __futex_unqueue().
*/
smp_store_release(&q->lock_ptr, NULL);
return true;
}
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed. Callers
@ -116,19 +134,12 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
{
struct task_struct *p = q->task;
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
return;
get_task_struct(p);
__futex_unqueue(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
* is written, without taking any locks. This is possible in the event
* of a spurious wakeup, for example. A memory barrier is required here
* to prevent the following store to lock_ptr from getting ahead of the
* plist_del in __futex_unqueue().
*/
smp_store_release(&q->lock_ptr, NULL);
if (!__futex_wake_mark(q)) {
put_task_struct(p);
return;
}
/*
* Queue the task for later wakeup for after we've released
@ -177,7 +188,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@ -292,7 +303,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@ -306,7 +317,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
@ -361,7 +372,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
}
/**
* unqueue_multiple - Remove various futexes from their hash bucket
* futex_unqueue_multiple - Remove various futexes from their hash bucket
* @v: The list of futexes to unqueue
* @count: Number of futexes in the list
*
@ -371,7 +382,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* - >=0 - Index of the last futex that was awoken;
* - -1 - No futex was awoken
*/
static int unqueue_multiple(struct futex_vector *v, int count)
int futex_unqueue_multiple(struct futex_vector *v, int count)
{
int ret = -1, i;
@ -399,7 +410,7 @@ static int unqueue_multiple(struct futex_vector *v, int count)
* - 0 - Success
* - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
*/
static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
struct futex_hash_bucket *hb;
bool retry = false;
@ -461,7 +472,7 @@ retry:
* was woken, we don't return error and return this index to
* userspace
*/
*woken = unqueue_multiple(vs, i);
*woken = futex_unqueue_multiple(vs, i);
if (*woken >= 0)
return 1;
@ -546,7 +557,7 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
__set_current_state(TASK_RUNNING);
ret = unqueue_multiple(vs, count);
ret = futex_unqueue_multiple(vs, count);
if (ret >= 0)
return ret;