blk-mq: remove hybrid polling

io_uring provides the only way user space can poll completions, and that
always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
code, so remove it and everything supporting it.
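
For context, the only user-space path that still reaches this polling code is an
io_uring ring created with IORING_SETUP_IOPOLL. Below is a minimal liburing
sketch of that path; the device path, queue depth, and buffer size are arbitrary
placeholders and error handling is trimmed:

/* Polled read via io_uring (IORING_SETUP_IOPOLL); requires liburing. */
#define _GNU_SOURCE			/* for O_DIRECT */
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *buf;
	int fd;

	/* IOPOLL rings complete I/O by polling the device, never by sleeping. */
	if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL))
		return 1;

	/* Polled I/O needs O_DIRECT and an aligned buffer; device is a placeholder. */
	fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);

	/* With IOPOLL, waiting here busy-polls the completion queue. */
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("read returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}

Completions are only truly polled when the device exposes poll queues (for nvme,
poll_queues > 0); otherwise the kernel quietly falls back to interrupt-driven
completion.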

Hybrid polling was effectively killed off with 9650b453a3, "block:
ignore RWF_HIPRI hint for sync dio", but remained potentially reachable
through io_uring until d729cf9acb, "io_uring: don't sleep when
polling for I/O". Hybrid polling probably should not have been
reachable through that async interface in the first place.
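
To make the dead-code argument concrete, the following is a small user-space
model of the pre-patch blk_mq_poll() decision, reduced from the code deleted
below; struct request_queue and the hybrid stub are deliberate simplifications,
not the kernel implementation:

/* Stand-alone model of the pre-patch polling decision; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define BLK_POLL_NOSLEEP	(1 << 1)	/* flag removed by this patch */
#define BLK_MQ_POLL_CLASSIC	-1

struct request_queue {
	int poll_nsec;			/* sysfs io_poll_delay, in nanoseconds */
};

static bool blk_mq_poll_hybrid(struct request_queue *q)
{
	(void)q;
	return true;			/* stand-in for the sleep-then-poll path */
}

static int blk_mq_poll(struct request_queue *q, unsigned int flags)
{
	/* Hybrid path: only reachable when the caller may sleep. */
	if (!(flags & BLK_POLL_NOSLEEP) && q->poll_nsec != BLK_MQ_POLL_CLASSIC)
		return blk_mq_poll_hybrid(q) ? 1 : 0;
	return 0;			/* classic busy-poll */
}

int main(void)
{
	struct request_queue q = { .poll_nsec = 50000 };	/* hybrid configured */

	/* io_uring's io_do_iopoll() always passed BLK_POLL_NOSLEEP ... */
	printf("io_uring caller -> %d (classic)\n", blk_mq_poll(&q, BLK_POLL_NOSLEEP));
	/* ... so only a sleeping caller, which no longer exists, could hit hybrid. */
	printf("sleeping caller -> %d (hybrid)\n", blk_mq_poll(&q, 0));
	return 0;
}

Since every io_uring poll arrived with BLK_POLL_NOSLEEP set, the first branch
could never be taken regardless of how io_poll_delay was configured, which is
what this patch relies on.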

Fixes: 9650b453a3 ("block: ignore RWF_HIPRI hint for sync dio")
Fixes: d729cf9acb ("io_uring: don't sleep when polling for I/O")
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20230320194926.3353144-1-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Keith Busch 2023-03-20 12:49:26 -07:00 committed by Jens Axboe
parent 4cf2c3ab2c
commit 54bdd67d0f
9 changed files with 12 additions and 299 deletions

Documentation/ABI/stable/sysfs-block

@@ -336,18 +336,11 @@ What:		/sys/block/<disk>/queue/io_poll_delay
 Date:		November 2016
 Contact:	linux-block@vger.kernel.org
 Description:
-		[RW] If polling is enabled, this controls what kind of polling
-		will be performed. It defaults to -1, which is classic polling.
+		[RW] This was used to control what kind of polling will be
+		performed. It is now fixed to -1, which is classic polling.
 		In this mode, the CPU will repeatedly ask for completions
-		without giving up any time. If set to 0, a hybrid polling mode
-		is used, where the kernel will attempt to make an educated guess
-		at when the IO will complete. Based on this guess, the kernel
-		will put the process issuing IO to sleep for an amount of time,
-		before entering a classic poll loop. This mode might be a little
-		slower than pure classic polling, but it will be more efficient.
-		If set to a value larger than 0, the kernel will put the process
-		issuing IO to sleep for this amount of microseconds before
-		entering classic polling.
+		without giving up any time.
+		<deprecated>
 
 What:		/sys/block/<disk>/queue/io_timeout

block/blk-core.c

@@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 static void blk_free_queue(struct request_queue *q)
 {
-	if (q->poll_stat)
-		blk_stat_remove_callback(q, q->poll_cb);
-	blk_stat_free_callback(q->poll_cb);
-
 	blk_free_queue_stats(q->stats);
-	kfree(q->poll_stat);
-
 	if (queue_is_mq(q))
 		blk_mq_release(q);

block/blk-mq-debugfs.c

@@ -15,33 +15,8 @@
 #include "blk-mq-tag.h"
 #include "blk-rq-qos.h"
 
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
-	if (stat->nr_samples) {
-		seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
-			   stat->nr_samples, stat->mean, stat->min, stat->max);
-	} else {
-		seq_puts(m, "samples=0");
-	}
-}
-
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
-	struct request_queue *q = data;
-	int bucket;
-
-	if (!q->poll_stat)
-		return 0;
-
-	for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
-		seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
-		print_stat(m, &q->poll_stat[2 * bucket]);
-		seq_puts(m, "\n");
-
-		seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket));
-		print_stat(m, &q->poll_stat[2 * bucket + 1]);
-		seq_puts(m, "\n");
-	}
 	return 0;
 }
 
@@ -282,7 +257,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
-	RQF_NAME(MQ_POLL_SLEPT),
 	RQF_NAME(TIMED_OUT),
 	RQF_NAME(ELV),
 	RQF_NAME(RESV),

block/blk-mq.c

@@ -46,51 +46,15 @@
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
-	int ddir, sectors, bucket;
-
-	ddir = rq_data_dir(rq);
-	sectors = blk_rq_stats_sectors(rq);
-
-	bucket = ddir + 2 * ilog2(sectors);
-
-	if (bucket < 0)
-		return -1;
-	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
-		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
-	return bucket;
-}
-
-#define BLK_QC_T_SHIFT		16
-#define BLK_QC_T_INTERNAL	(1U << 31)
-
 static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
 		blk_qc_t qc)
 {
-	return xa_load(&q->hctx_table,
-			(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
-}
-
-static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
-		blk_qc_t qc)
-{
-	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
-
-	if (qc & BLK_QC_T_INTERNAL)
-		return blk_mq_tag_to_rq(hctx->sched_tags, tag);
-	return blk_mq_tag_to_rq(hctx->tags, tag);
+	return xa_load(&q->hctx_table, qc);
 }
 
 static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 {
-	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
-		(rq->tag != -1 ?
-		 rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+	return rq->mq_hctx->queue_num;
 }
 
 /*
@@ -1038,10 +1002,8 @@ static inline void blk_account_io_start(struct request *req)
 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 {
-	if (rq->rq_flags & RQF_STATS) {
-		blk_mq_poll_stats_start(rq->q);
+	if (rq->rq_flags & RQF_STATS)
 		blk_stat_add(rq, now);
-	}
 
 	blk_mq_sched_completed_request(rq, now);
 	blk_account_io_done(rq, now);
@@ -4222,14 +4184,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
-	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
-					     blk_mq_poll_stats_bkt,
-					     BLK_MQ_POLL_STATS_BKTS, q);
-	if (!q->poll_cb)
-		goto err_exit;
-
 	if (blk_mq_alloc_ctxs(q))
-		goto err_poll;
+		goto err_exit;
 
 	/* init q->mq_kobj and sw queues' kobjects */
 	blk_mq_sysfs_init(q);
@@ -4257,11 +4213,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->nr_requests = set->queue_depth;
 
-	/*
-	 * Default to classic polling
-	 */
-	q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
@@ -4269,9 +4220,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 err_hctxs:
 	blk_mq_release(q);
-err_poll:
-	blk_stat_free_callback(q->poll_cb);
-	q->poll_cb = NULL;
 err_exit:
 	q->mq_ops = NULL;
 	return -ENOMEM;
@@ -4768,138 +4716,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
-	if (q->poll_stat)
-		return true;
-
-	return blk_stats_alloc_enable(q);
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
-	/*
-	 * We don't arm the callback if polling stats are not enabled or the
-	 * callback is already active.
-	 */
-	if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
-		return;
-
-	blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
-	struct request_queue *q = cb->data;
-	int bucket;
-
-	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
-		if (cb->stat[bucket].nr_samples)
-			q->poll_stat[bucket] = cb->stat[bucket];
-	}
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
-				       struct request *rq)
-{
-	unsigned long ret = 0;
-	int bucket;
-
-	/*
-	 * If stats collection isn't on, don't sleep but turn it on for
-	 * future users
-	 */
-	if (!blk_poll_stats_enable(q))
-		return 0;
-
-	/*
-	 * As an optimistic guess, use half of the mean service time
-	 * for this type of request. We can (and should) make this smarter.
-	 * For instance, if the completion latencies are tight, we can
-	 * get closer than just half the mean. This is especially
-	 * important on devices where the completion latencies are longer
-	 * than ~10 usec. We do use the stats for the relevant IO size
-	 * if available which does lead to better estimates.
-	 */
-	bucket = blk_mq_poll_stats_bkt(rq);
-	if (bucket < 0)
-		return ret;
-
-	if (q->poll_stat[bucket].nr_samples)
-		ret = (q->poll_stat[bucket].mean + 1) / 2;
-
-	return ret;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
-{
-	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
-	struct request *rq = blk_qc_to_rq(hctx, qc);
-	struct hrtimer_sleeper hs;
-	enum hrtimer_mode mode;
-	unsigned int nsecs;
-	ktime_t kt;
-
-	/*
-	 * If a request has completed on queue that uses an I/O scheduler, we
-	 * won't get back a request from blk_qc_to_rq.
-	 */
-	if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
-		return false;
-
-	/*
-	 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
-	 *
-	 *  0:	use half of prev avg
-	 * >0:	use this specific value
-	 */
-	if (q->poll_nsec > 0)
-		nsecs = q->poll_nsec;
-	else
-		nsecs = blk_mq_poll_nsecs(q, rq);
-
-	if (!nsecs)
-		return false;
-
-	rq->rq_flags |= RQF_MQ_POLL_SLEPT;
-
-	/*
-	 * This will be replaced with the stats tracking code, using
-	 * 'avg_completion_time / 2' as the pre-sleep target.
-	 */
-	kt = nsecs;
-
-	mode = HRTIMER_MODE_REL;
-	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
-	hrtimer_set_expires(&hs.timer, kt);
-
-	do {
-		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
-			break;
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		hrtimer_sleeper_start_expires(&hs, mode);
-		if (hs.task)
-			io_schedule();
-		hrtimer_cancel(&hs.timer);
-		mode = HRTIMER_MODE_ABS;
-	} while (hs.task && !signal_pending(current));
-
-	__set_current_state(TASK_RUNNING);
-	destroy_hrtimer_on_stack(&hs.timer);
-
-	/*
-	 * If we sleep, have the caller restart the poll loop to reset the
-	 * state. Like for the other success return cases, the caller is
-	 * responsible for checking if the IO completed. If the IO isn't
-	 * complete, we'll get called again and will go straight to the busy
-	 * poll loop.
-	 */
-	return true;
-}
-
-static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
-			       struct io_comp_batch *iob, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+		unsigned int flags)
 {
 	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
 	long state = get_current_state();
@@ -4926,17 +4744,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
 	return 0;
 }
 
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
-		unsigned int flags)
-{
-	if (!(flags & BLK_POLL_NOSLEEP) &&
-	    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
-		if (blk_mq_poll_hybrid(q, cookie))
-			return 1;
-	}
-	return blk_mq_poll_classic(q, cookie, iob, flags);
-}
-
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
 	return rq->mq_ctx->cpu;

block/blk-stat.c

@@ -231,21 +231,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
 
 	kfree(stats);
 }
-
-bool blk_stats_alloc_enable(struct request_queue *q)
-{
-	struct blk_rq_stat *poll_stat;
-
-	poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
-				GFP_ATOMIC);
-	if (!poll_stat)
-		return false;
-
-	if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
-		kfree(poll_stat);
-		return true;
-	}
-
-	blk_stat_add_callback(q, q->poll_cb);
-	return false;
-}

block/blk-sysfs.c

@@ -408,35 +408,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
-	int val;
-
-	if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
-		val = BLK_MQ_POLL_CLASSIC;
-	else
-		val = q->poll_nsec / 1000;
-
-	return sprintf(page, "%d\n", val);
+	return sprintf(page, "%d\n", -1);
 }
 
 static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 				size_t count)
 {
-	int err, val;
-
-	if (!q->mq_ops || !q->mq_ops->poll)
-		return -EINVAL;
-
-	err = kstrtoint(page, 10, &val);
-	if (err < 0)
-		return err;
-
-	if (val == BLK_MQ_POLL_CLASSIC)
-		q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-	else if (val >= 0)
-		q->poll_nsec = val * 1000;
-	else
-		return -EINVAL;
-
 	return count;
 }

include/linux/blk-mq.h

@@ -57,8 +57,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
 /* queue has elevator attached */

include/linux/blkdev.h

@@ -44,12 +44,6 @@ extern const struct device_type disk_type;
 extern struct device_type part_type;
 extern struct class block_class;
 
-/* Must be consistent with blk_mq_poll_stats_bkt() */
-#define BLK_MQ_POLL_STATS_BKTS 16
-
-/* Doing classic polling */
-#define BLK_MQ_POLL_CLASSIC -1
-
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -468,10 +462,6 @@ struct request_queue {
 #endif
 	unsigned int		rq_timeout;
 
-	int			poll_nsec;
-	struct blk_stat_callback	*poll_cb;
-	struct blk_rq_stat	*poll_stat;
-
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
@@ -870,8 +860,6 @@ blk_status_t errno_to_blk_status(int errno);
 /* only poll the hardware once, don't continue until a completion was found */
 #define BLK_POLL_ONESHOT		(1 << 0)
-/* do not sleep to wait for the expected completion time */
-#define BLK_POLL_NOSLEEP		(1 << 1)
 int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
 int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
 		unsigned int flags);

io_uring/rw.c

@@ -1002,7 +1002,7 @@ void io_rw_fail(struct io_kiocb *req)
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
-	unsigned int poll_flags = BLK_POLL_NOSLEEP;
+	unsigned int poll_flags = 0;
 	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;