// SPDX-License-Identifier: GPL-2.0
/*
 * Functions to sequence PREFLUSH and FUA writes.
 *
 * Copyright (C) 2011		Max Planck Institute for Gravitational Physics
 * Copyright (C) 2011		Tejun Heo <tj@kernel.org>
 *
 * REQ_{PREFLUSH|FUA} requests are decomposed into sequences consisting of
 * three optional steps - PREFLUSH, DATA and POSTFLUSH - according to the
 * request properties and hardware capability.
 *
 * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
 * indicates a simple flush request.  If there is data, REQ_PREFLUSH indicates
 * that the device cache should be flushed before the data is executed, and
 * REQ_FUA means that the data must be on non-volatile media on request
 * completion.
 *
 * If the device doesn't have a writeback cache, PREFLUSH and FUA don't make
 * any difference.  The requests are either completed immediately if there's
 * no data or executed as normal requests otherwise.
 *
 * If the device has a writeback cache and supports FUA, REQ_PREFLUSH is
 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
 *
 * If the device has a writeback cache and doesn't support FUA, REQ_PREFLUSH
 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
 *
 * The actual execution of flush is double buffered.  Whenever a request
 * needs to execute PRE or POSTFLUSH, it queues at
 * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
 * REQ_OP_FLUSH is issued and the pending_idx is toggled.  When the flush
 * completes, all the requests which were pending are advanced to the next
 * step.  This allows arbitrary merging of different types of PREFLUSH/FUA
 * requests.
 *
 * Currently, the following conditions are used to determine when to issue
 * a flush.
 *
 * C1. At any given time, only one flush shall be in progress.  This makes
 *     double buffering sufficient.
 *
 * C2. A flush is deferred if any request is executing DATA of its sequence.
 *     This avoids issuing separate POSTFLUSHes for requests which shared
 *     PREFLUSH.
 *
 * C3. The second condition is ignored if there is a request which has
 *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
 *     starvation in the unlikely case where there is a continuous stream of
 *     FUA (without PREFLUSH) requests.
 *
 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
 * is beneficial.
 *
 * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice.
 * Once while executing DATA and again after the whole sequence is
 * complete.  The first completion updates the contained bio but doesn't
 * finish it so that the bio submitter is notified only after the whole
 * sequence is complete.  This is implemented by testing RQF_FLUSH_SEQ in
 * req_bio_endio().
 *
 * The above peculiarity requires that each PREFLUSH/FUA request has only one
 * bio attached to it, which is guaranteed as they aren't allowed to be
 * merged in the usual way.
 */
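
/*
 * Illustrative decomposition (follows from the rules above): on a device
 * with a writeback cache but no FUA support, a REQ_PREFLUSH|REQ_FUA write
 * is sequenced as PREFLUSH -> DATA -> POSTFLUSH; if the device does
 * support FUA, the same request becomes PREFLUSH -> DATA with REQ_FUA kept
 * on the data write; and without a writeback cache it is executed as a
 * plain write.
 */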

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/part_stat.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"

/* PREFLUSH/FUA sequences */
enum {
	REQ_FSEQ_PREFLUSH	= (1 << 0), /* pre-flushing in progress */
	REQ_FSEQ_DATA		= (1 << 1), /* data write in progress */
	REQ_FSEQ_POSTFLUSH	= (1 << 2), /* post-flushing in progress */
	REQ_FSEQ_DONE		= (1 << 3),

	REQ_FSEQ_ACTIONS	= REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
				  REQ_FSEQ_POSTFLUSH,

	/*
	 * If flush has been pending longer than the following timeout,
	 * it's issued even if flush_data requests are still in flight.
	 */
	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
};

static void blk_kick_flush(struct request_queue *q,
			   struct blk_flush_queue *fq, blk_opf_t flags);

static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
	return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}

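/*
 * Work out which of the REQ_FSEQ_* steps @rq needs based on the queue's
 * cache features (@fflags) and the request's own flags.
 */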
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (blk_rq_sectors(rq))
		policy |= REQ_FSEQ_DATA;

	if (fflags & (1UL << QUEUE_FLAG_WC)) {
		if (rq->cmd_flags & REQ_PREFLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
		    (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}

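/*
 * Next step of the flush sequence for @rq: the lowest bit not yet set in
 * rq->flush.seq.
 */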
static unsigned int blk_flush_cur_seq(struct request *rq)
{
	return 1 << ffz(rq->flush.seq);
}

static void blk_flush_restore_request(struct request *rq)
{
	/*
	 * After flush data completion, @rq->bio is %NULL but we need to
	 * complete the bio again.  @rq->biotail is guaranteed to equal the
	 * original @rq->bio.  Restore it.
	 */
	rq->bio = rq->biotail;

	/* make @rq a normal request */
	rq->rq_flags &= ~RQF_FLUSH_SEQ;
	rq->end_io = rq->flush.saved_end_io;
}

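/* Account the completed flush in the whole-disk STAT_FLUSH counters. */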
static void blk_account_io_flush(struct request *rq)
{
	struct block_device *part = rq->q->disk->part0;

	part_stat_lock();
	part_stat_inc(part, ios[STAT_FLUSH]);
	part_stat_add(part, nsecs[STAT_FLUSH],
		      ktime_get_ns() - rq->start_time_ns);
	part_stat_unlock();
}

/**
 * blk_flush_complete_seq - complete flush sequence
 * @rq: PREFLUSH/FUA request being sequenced
 * @fq: flush queue
 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
 * @error: whether an error occurred
 *
 * @rq just completed @seq part of its flush sequence, record the
 * completion and trigger the next step.
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
 */
static void blk_flush_complete_seq(struct request *rq,
				   struct blk_flush_queue *fq,
				   unsigned int seq, blk_status_t error)
{
	struct request_queue *q = rq->q;
	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	blk_opf_t cmd_flags;

	BUG_ON(rq->flush.seq & seq);
	rq->flush.seq |= seq;
	cmd_flags = rq->cmd_flags;

	if (likely(!error))
		seq = blk_flush_cur_seq(rq);
	else
		seq = REQ_FSEQ_DONE;

	switch (seq) {
	case REQ_FSEQ_PREFLUSH:
	case REQ_FSEQ_POSTFLUSH:
		/* queue for flush */
		if (list_empty(pending))
			fq->flush_pending_since = jiffies;
		list_move_tail(&rq->queuelist, pending);
		break;

	case REQ_FSEQ_DATA:
		fq->flush_data_in_flight++;
		spin_lock(&q->requeue_lock);
		list_move(&rq->queuelist, &q->requeue_list);
		spin_unlock(&q->requeue_lock);
		blk_mq_kick_requeue_list(q);
		break;

	case REQ_FSEQ_DONE:
		/*
		 * @rq was previously adjusted by blk_insert_flush() for
		 * flush sequencing and may already have gone through the
		 * flush data request completion path.  Restore @rq for
		 * normal completion and end it.
		 */
		list_del_init(&rq->queuelist);
		blk_flush_restore_request(rq);
		blk_mq_end_request(rq, error);
		break;

	default:
		BUG();
	}

	blk_kick_flush(q, fq, cmd_flags);
}

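/*
 * Completion handler for the flush request itself.  Once the last
 * reference to the flush request is dropped, toggle the running index and
 * advance every request that was waiting on this flush to its next
 * sequence step.
 */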
static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
				       blk_status_t error)
{
	struct request_queue *q = flush_rq->q;
	struct list_head *running;
	struct request *rq, *n;
	unsigned long flags = 0;
	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);

	/* release the tag's ownership to the req cloned from */
	spin_lock_irqsave(&fq->mq_flush_lock, flags);

	if (!req_ref_put_and_test(flush_rq)) {
		fq->rq_status = error;
		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
		return RQ_END_IO_NONE;
	}

	blk_account_io_flush(flush_rq);
	/*
	 * The flush request has to be marked as IDLE when it is really ended
	 * because its .end_io() is also called from the timeout code path,
	 * to avoid a use-after-free.
	 */
	WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
	if (fq->rq_status != BLK_STS_OK) {
		error = fq->rq_status;
		fq->rq_status = BLK_STS_OK;
	}

	if (!q->elevator) {
		flush_rq->tag = BLK_MQ_NO_TAG;
	} else {
		blk_mq_put_driver_tag(flush_rq);
		flush_rq->internal_tag = BLK_MQ_NO_TAG;
	}

	running = &fq->flush_queue[fq->flush_running_idx];
	BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);

	/* account completion of the flush request */
	fq->flush_running_idx ^= 1;

	/* and push the waiting requests to the next stage */
	list_for_each_entry_safe(rq, n, running, queuelist) {
		unsigned int seq = blk_flush_cur_seq(rq);

		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
		blk_flush_complete_seq(rq, fq, seq, error);
	}

	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
	return RQ_END_IO_NONE;
}

bool is_flush_rq(struct request *rq)
{
	return rq->end_io == flush_end_io;
}

/**
 * blk_kick_flush - consider issuing flush request
 * @q: request_queue being kicked
 * @fq: flush queue
 * @flags: cmd_flags of the original request
 *
 * Flush related states of @q have changed, consider issuing flush request.
 * Please read the comment at the top of this file for more info.
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
 *
 */
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
			   blk_opf_t flags)
{
	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	struct request *first_rq =
		list_first_entry(pending, struct request, queuelist);
	struct request *flush_rq = fq->flush_rq;

	/* C1 described at the top of this file */
	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
		return;

	/* C2 and C3 */
	if (fq->flush_data_in_flight &&
	    time_before(jiffies,
			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
		return;

	/*
	 * Issue flush and toggle pending_idx.  This makes pending_idx
	 * different from running_idx, which means flush is in flight.
	 */
	fq->flush_pending_idx ^= 1;

	blk_rq_init(q, flush_rq);

	/*
	 * In case of none scheduler, borrow the tag from the first request
	 * since they can't be in flight at the same time, and acquire the
	 * tag's ownership for the flush req.
	 *
	 * In case of an IO scheduler, the flush rq only needs to borrow the
	 * scheduler tag to cheat the driver tag put/get path.
	 */
	flush_rq->mq_ctx = first_rq->mq_ctx;
	flush_rq->mq_hctx = first_rq->mq_hctx;

	if (!q->elevator) {
		flush_rq->tag = first_rq->tag;

		/*
		 * We borrow the data request's driver tag, so we have to
		 * mark this flush request as INFLIGHT to avoid double
		 * accounting of this driver tag.
		 */
		flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
	} else
		flush_rq->internal_tag = first_rq->internal_tag;

	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
	flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
	flush_rq->end_io = flush_end_io;
	/*
	 * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
	 * implied in refcount_inc_not_zero() called from
	 * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref
	 * and READ flush_rq->end_io
	 */
	smp_wmb();
	req_ref_set(flush_rq, 1);

	spin_lock(&q->requeue_lock);
	list_add_tail(&flush_rq->queuelist, &q->flush_list);
	spin_unlock(&q->requeue_lock);

	blk_mq_kick_requeue_list(q);
}

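/*
 * Completion handler for the DATA step of a sequenced PREFLUSH/FUA
 * request.  Marks the DATA step done and kicks the flush state machine,
 * which may queue a POSTFLUSH or end the request.
 */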
static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
					       blk_status_t error)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	unsigned long flags;
	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);

	if (q->elevator) {
		WARN_ON(rq->tag < 0);
		blk_mq_put_driver_tag(rq);
	}

	/*
	 * After populating an empty queue, kick it to avoid stall.  Read
	 * the comment in flush_end_io().
	 */
	spin_lock_irqsave(&fq->mq_flush_lock, flags);
	fq->flush_data_in_flight--;
	/*
	 * rq->queuelist may have been corrupted by rq->rq_next reuse, so
	 * re-initialize it before reusing it here.
	 */
	INIT_LIST_HEAD(&rq->queuelist);
	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);

	blk_mq_sched_restart(hctx);
	return RQ_END_IO_NONE;
}

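/*
 * Prepare @rq for flush sequencing: clear the sequence state, mark it with
 * RQF_FLUSH_SEQ and divert its completion to mq_flush_data_end_io().
 */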
static void blk_rq_init_flush(struct request *rq)
{
	rq->flush.seq = 0;
	rq->rq_flags |= RQF_FLUSH_SEQ;
	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
	rq->end_io = mq_flush_data_end_io;
}

/*
 * Insert a PREFLUSH/FUA request into the flush state machine.
 * Returns true if the request has been consumed by the flush state machine,
 * or false if the caller should continue to process it.
 */
bool blk_insert_flush(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned long fflags = q->queue_flags;	/* may change, cache */
	unsigned int policy = blk_flush_policy(fflags, rq);
	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);

	/* FLUSH/FUA request must never be merged */
	WARN_ON_ONCE(rq->bio != rq->biotail);

	/*
	 * @policy now records what operations need to be done.  Adjust
	 * REQ_PREFLUSH and FUA for the driver.
	 */
	rq->cmd_flags &= ~REQ_PREFLUSH;
	if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
		rq->cmd_flags &= ~REQ_FUA;

	/*
	 * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any
	 * of those flags, we have to set REQ_SYNC to avoid skewing
	 * the request accounting.
	 */
	rq->cmd_flags |= REQ_SYNC;

	switch (policy) {
	case 0:
		/*
		 * An empty flush handed down from a stacking driver may
		 * translate into nothing if the underlying device does not
		 * advertise a write-back cache.  In this case, simply
		 * complete the request.
		 */
		blk_mq_end_request(rq, 0);
		return true;
	case REQ_FSEQ_DATA:
		/*
		 * If there's data, but no flush is necessary, the request can
		 * be processed directly without going through flush machinery.
		 * Queue for normal execution.
		 */
		return false;
	case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH:
		/*
		 * Initialize the flush fields and completion handler to trigger
		 * the post flush, and then just pass the command on.
		 */
		blk_rq_init_flush(rq);
		rq->flush.seq |= REQ_FSEQ_PREFLUSH;
		spin_lock_irq(&fq->mq_flush_lock);
		fq->flush_data_in_flight++;
		spin_unlock_irq(&fq->mq_flush_lock);
		return false;
	default:
		/*
		 * Mark the request as part of a flush sequence and submit it
		 * for further processing to the flush state machine.
		 */
		blk_rq_init_flush(rq);
		spin_lock_irq(&fq->mq_flush_lock);
		blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
		spin_unlock_irq(&fq->mq_flush_lock);
		return true;
	}
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:	blockdev to issue flush for
 *
 * Description:
 *    Issue a flush for the block device in question.
 */
int blkdev_issue_flush(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
	return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);

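/*
 * Allocate a blk_flush_queue along with its preallocated flush request.
 * @cmd_size extra bytes are appended to the flush request for driver
 * private data, rounded up to a cache line as for regular requests.
 */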
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
					      gfp_t flags)
{
	struct blk_flush_queue *fq;
	int rq_sz = sizeof(struct request);

	fq = kzalloc_node(sizeof(*fq), flags, node);
	if (!fq)
		goto fail;

	spin_lock_init(&fq->mq_flush_lock);

	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
	fq->flush_rq = kzalloc_node(rq_sz, flags, node);
	if (!fq->flush_rq)
		goto fail_rq;

	INIT_LIST_HEAD(&fq->flush_queue[0]);
	INIT_LIST_HEAD(&fq->flush_queue[1]);

	return fq;

 fail_rq:
	kfree(fq);
 fail:
	return NULL;
}

void blk_free_flush_queue(struct blk_flush_queue *fq)
{
	/* bio based request queues don't have a flush queue */
	if (!fq)
		return;

	kfree(fq->flush_rq);
	kfree(fq);
}

/*
 * Allow a driver to set its own lock class on fq->mq_flush_lock to avoid
 * lockdep complaints.
 *
 * flush_end_io() may be called recursively from some drivers, such as
 * nvme-loop, so lockdep may complain about 'possible recursive locking'
 * because all 'struct blk_flush_queue' instances share the same
 * mq_flush_lock lock class key.  We need to assign a different lock class
 * to these drivers' fq->mq_flush_lock to avoid the lockdep warning.
 *
 * Using a dynamically allocated lock class key for each 'blk_flush_queue'
 * instance would be overkill, and worse, it introduces a horrible boot
 * delay because synchronize_rcu() is implied in lockdep_unregister_key(),
 * which is called for each hctx release.  SCSI probing may synchronously
 * create and destroy lots of MQ request_queues for non-existent devices,
 * and some robot test kernels always enable the lockdep option.  More than
 * half an hour has been observed for SCSI MQ probing with a per-fq lock
 * class.
 */
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key)
{
	lockdep_set_class(&hctx->fq->mq_flush_lock, key);
}
EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);