// SPDX-License-Identifier: GPL-2.0
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

/*
 * Mark a hardware queue as needing a restart.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	/*
	 * Order clearing SCHED_RESTART against list_empty_careful(&hctx->dispatch)
	 * in blk_mq_run_hw_queue(). Its pair is the barrier in
	 * blk_mq_dispatch_rq_list(). Without it, the dispatch code might not
	 * see SCHED_RESTART while a request newly added to hctx->dispatch is
	 * also missed by the check in blk_mq_run_hw_queue().
	 */
	smp_mb();

	blk_mq_run_hw_queue(hctx, true);
}

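/*
 * list_sort() comparator used below: group requests by their hardware
 * queue so that a sorted list has all requests for one hctx adjacent.
 */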
static int sched_rq_cmp(void *priv, const struct list_head *a,
			const struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return rqa->mq_hctx > rqb->mq_hctx;
}

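/*
 * Dispatch the leading run of requests in @rq_list that share the first
 * request's hctx, cutting them off the front of the list and handing
 * them to blk_mq_dispatch_rq_list() as one batch.
 */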
static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
	struct blk_mq_hw_ctx *hctx =
		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
	struct request *rq;
	LIST_HEAD(hctx_list);
	unsigned int count = 0;

	list_for_each_entry(rq, rq_list, queuelist) {
		if (rq->mq_hctx != hctx) {
			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
			goto dispatch;
		}
		count++;
	}
	list_splice_tail_init(rq_list, &hctx_list);

dispatch:
	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}

#define BLK_MQ_BUDGET_DELAY	3		/* ms units */

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	bool multi_hctxs = false, run_queue = false;
	bool dispatched = false, busy = false;
	unsigned int max_dispatch;
	LIST_HEAD(rq_list);
	int count = 0;

	if (hctx->dispatch_busy)
		max_dispatch = 1;
	else
		max_dispatch = hctx->queue->nr_requests;

	do {
		struct request *rq;
		int budget_token;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		if (!list_empty_careful(&hctx->dispatch)) {
			busy = true;
			break;
		}

		budget_token = blk_mq_get_dispatch_budget(q);
		if (budget_token < 0)
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q, budget_token);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue.  Kick it
			 * ourselves.
			 */
			run_queue = true;
			break;
		}

		blk_mq_set_rq_budget_token(rq, budget_token);

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add_tail(&rq->queuelist, &rq_list);
		count++;
		if (rq->mq_hctx != hctx)
			multi_hctxs = true;

		/*
		 * If we cannot get a tag for the request, stop dequeueing
		 * requests from the IO scheduler. We are unlikely to be able
		 * to submit them anyway, and it gives the scheduling
		 * heuristics the false impression that the device can take
		 * more IO.
		 */
		if (!blk_mq_get_driver_tag(rq))
			break;
	} while (count < max_dispatch);

	if (!count) {
		if (run_queue)
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
	} else if (multi_hctxs) {
		/*
		 * Requests from different hctxs may be dequeued from some
		 * schedulers, such as bfq and deadline.
		 *
		 * Sort the requests in the list according to their hctx,
		 * then dispatch batches of requests from the same hctx.
		 */
		list_sort(NULL, &rq_list, sched_rq_cmp);
		do {
			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
		} while (!list_empty(&rq_list));
	} else {
		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
	}

	if (busy)
		return -EAGAIN;
	return !!dispatched;
}

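/*
 * Keep pulling batches from the elevator via __blk_mq_do_dispatch_sched()
 * until it stops making progress. Bail out and reschedule the queue run
 * if we need to resched or have been dispatching for about a second, so
 * a single hctx cannot monopolise the CPU.
 */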
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	unsigned long end = jiffies + HZ;
	int ret;

	do {
		ret = __blk_mq_do_dispatch_sched(hctx);
		if (ret != 1)
			break;
		if (need_resched() || time_is_before_jiffies(end)) {
			blk_mq_delay_run_hw_queue(hctx, 0);
			break;
		}
	} while (1);

	return ret;
}

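/*
 * Return the software queue that follows @ctx on @hctx, wrapping around
 * at the end; used for round-robin dispatch from the sw queues.
 */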
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned short idx = ctx->index_hw[hctx->type];

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.
 */
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
	int ret = 0;
	struct request *rq;

	do {
		int budget_token;

		if (!list_empty_careful(&hctx->dispatch)) {
			ret = -EAGAIN;
			break;
		}

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		budget_token = blk_mq_get_dispatch_budget(q);
		if (budget_token < 0)
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q, budget_token);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue.  Kick it
			 * ourselves.
			 */
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
			break;
		}

		blk_mq_set_rq_budget_token(rq, budget_token);

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

	WRITE_ONCE(hctx->dispatch_from, ctx);
	return ret;
}

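/*
 * Dispatch requests for @hctx: drain any leftovers on hctx->dispatch
 * first, then either pull from the elevator, dequeue round-robin from
 * the sw queues (when the device is busy), or flush all sw queues at
 * once. Returns -EAGAIN when hctx->dispatch needs another run.
 */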
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	bool need_dispatch = false;
	LIST_HEAD(rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
			return 0;
		need_dispatch = true;
	} else {
		need_dispatch = hctx->dispatch_busy;
	}

	if (hctx->queue->elevator)
		return blk_mq_do_dispatch_sched(hctx);

	/* dequeue requests one by one from the sw queue if the queue is busy */
	if (need_dispatch)
		return blk_mq_do_dispatch_ctx(hctx);
	blk_mq_flush_busy_ctxs(hctx, &rq_list);
	blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
	return 0;
}

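/*
 * Called from the hw queue run paths. Skips stopped/quiesced queues and
 * retries the dispatch once on -EAGAIN before falling back to an async
 * queue run, so pending requests on hctx->dispatch are not starved.
 */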
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * A return of -EAGAIN is an indication that hctx->dispatch is not
	 * empty and we must run again in order to avoid starving flushes.
	 */
	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
			blk_mq_run_hw_queue(hctx, true);
	}
}

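/*
 * Try to merge @bio into an existing request: prefer the elevator's
 * ->bio_merge() hook when an I/O scheduler is attached, otherwise fall
 * back to a bounded reverse scan of the per-cpu software queue.
 */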
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	bool ret = false;
	enum hctx_type type;

	if (e && e->type->ops.bio_merge) {
		ret = e->type->ops.bio_merge(q, bio, nr_segs);
		goto out_put;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	type = hctx->type;
	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
	    list_empty_careful(&ctx->rq_lists[type]))
		goto out_put;

	/* default per sw-queue merge */
	spin_lock(&ctx->lock);
	/*
	 * Reverse check our software queue for entries that we could
	 * potentially merge with. Currently includes a hand-wavy stop
	 * count of 8, to not spend too much time checking for merges.
	 */
	if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
		ret = true;

	spin_unlock(&ctx->lock);
out_put:
	return ret;
}

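/* Attempt an insert-time merge; any requests freed by merging go on @free. */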
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
				   struct list_head *free)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

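/*
 * Allocate the scheduler tags (and static requests) for one hctx, or
 * simply point it at the shared set when the tag set uses shared tags.
 */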
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
					  struct blk_mq_hw_ctx *hctx,
					  unsigned int hctx_idx)
{
	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		hctx->sched_tags = q->sched_shared_tags;
		return 0;
	}

	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
						    q->nr_requests);

	if (!hctx->sched_tags)
		return -ENOMEM;
	return 0;
}

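/* Free the shared scheduler tag map allocated at elevator init time. */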
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
	blk_mq_free_rq_map(queue->sched_shared_tags);
	queue->sched_shared_tags = NULL;
}

/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags) {
			if (!blk_mq_is_shared_tags(flags))
				blk_mq_free_rq_map(hctx->sched_tags);
			hctx->sched_tags = NULL;
		}
	}

	if (blk_mq_is_shared_tags(flags))
		blk_mq_exit_sched_shared_tags(q);
}

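/*
 * Allocate one scheduler tag map shared by all hw queues, sized up front
 * to MAX_SCHED_RQ (see the comment below about nr_requests updates).
 */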
static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
	struct blk_mq_tag_set *set = queue->tag_set;

	/*
	 * Set initial depth at max so that we don't need to reallocate for
	 * updating nr_requests.
	 */
	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
						BLK_MQ_NO_HCTX_IDX,
						MAX_SCHED_RQ);
	if (!queue->sched_shared_tags)
		return -ENOMEM;

	blk_mq_tag_update_sched_shared_tags(queue);

	return 0;
}

/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	unsigned int flags = q->tag_set->flags;
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned long i;
	int ret;

	/*
	 * Default to twice the smaller of the hardware queue depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw-queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_DEFAULT_RQ);

	if (blk_mq_is_shared_tags(flags)) {
		ret = blk_mq_init_sched_shared_tags(q);
		if (ret)
			return ret;
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
		if (ret)
			goto err_free_map_and_rqs;
	}

	ret = e->ops.init_sched(q, e);
	if (ret)
		goto err_free_map_and_rqs;

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_register_sched(q);
	mutex_unlock(&q->debugfs_mutex);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_sched_free_rqs(q);
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_sched_hctx(q, hctx);
		mutex_unlock(&q->debugfs_mutex);
	}

	return 0;

err_free_map_and_rqs:
	blk_mq_sched_free_rqs(q);
	blk_mq_sched_tags_teardown(q, flags);

	q->elevator = NULL;
	return ret;
}

/*
 * called in either blk_queue_cleanup or elevator_switch, tagset
 * is required for freeing requests
 */
void blk_mq_sched_free_rqs(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
		blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
				BLK_MQ_NO_HCTX_IDX);
	} else {
		queue_for_each_hw_ctx(q, hctx, i) {
			if (hctx->sched_tags)
				blk_mq_free_rqs(q->tag_set,
						hctx->sched_tags, i);
		}
	}
}

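/*
 * Tear down an elevator instance: per-hctx scheduler data and debugfs
 * entries first, then the scheduler itself and its tag maps.
 */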
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;
	unsigned int flags = 0;

	queue_for_each_hw_ctx(q, hctx, i) {
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		mutex_unlock(&q->debugfs_mutex);

		if (e->type->ops.exit_hctx && hctx->sched_data) {
			e->type->ops.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
		flags = hctx->flags;
	}

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_unregister_sched(q);
	mutex_unlock(&q->debugfs_mutex);

	if (e->type->ops.exit_sched)
		e->type->ops.exit_sched(e);
	blk_mq_sched_tags_teardown(q, flags);
	q->elevator = NULL;
}