// SPDX-License-Identifier: GPL-2.0
/*
 * Functions to sequence PREFLUSH and FUA writes.
 *
 * Copyright (C) 2011		Max Planck Institute for Gravitational Physics
 * Copyright (C) 2011		Tejun Heo <tj@kernel.org>
 *
 * REQ_{PREFLUSH|FUA} requests are decomposed into sequences consisting of
 * three optional steps - PREFLUSH, DATA and POSTFLUSH - according to the
 * request properties and hardware capability.
 *
 * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
 * indicates a simple flush request.  If there is data, REQ_PREFLUSH indicates
 * that the device cache should be flushed before the data is executed, and
 * REQ_FUA means that the data must be on non-volatile media on request
 * completion.
 *
 * If the device doesn't have a writeback cache, PREFLUSH and FUA don't make
 * any difference.  The requests are either completed immediately if there's
 * no data or executed as normal requests otherwise.
 *
 * If the device has a writeback cache and supports FUA, REQ_PREFLUSH is
 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
 *
 * If the device has a writeback cache and doesn't support FUA, REQ_PREFLUSH
 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
 *
 * The actual execution of flush is double buffered.  Whenever a request
 * needs to execute PRE or POSTFLUSH, it queues at
 * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
 * REQ_OP_FLUSH is issued and the pending_idx is toggled.  When the flush
 * completes, all the requests which were pending proceed to the next
 * step.  This allows arbitrary merging of different types of PREFLUSH/FUA
 * requests.
 *
 * Currently, the following conditions are used to determine when to issue
 * a flush.
 *
 * C1. At any given time, only one flush shall be in progress.  This makes
 *     double buffering sufficient.
 *
 * C2. Flush is deferred if any request is executing DATA of its sequence.
 *     This avoids issuing separate POSTFLUSHes for requests which shared
 *     PREFLUSH.
 *
 * C3. The second condition is ignored if there is a request which has
 *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
 *     starvation in the unlikely case where there is a continuous stream
 *     of FUA (without PREFLUSH) requests.
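 *
 * As an example of C2, several PREFLUSH+FUA requests that shared one
 * preflush may all be executing their DATA steps at once; the next flush
 * is held back (up to FLUSH_PENDING_TIMEOUT) so that, on a device without
 * FUA support, a single POSTFLUSH can cover all of them.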
 *
 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
 * is beneficial.
 *
 * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice.
 * Once while executing DATA and again after the whole sequence is
 * complete.  The first completion updates the contained bio but doesn't
 * finish it so that the bio submitter is notified only after the whole
 * sequence is complete.  This is implemented by testing RQF_FLUSH_SEQ in
 * req_bio_endio().
 *
 * The above peculiarity requires that each PREFLUSH/FUA request has only one
 * bio attached to it, which is guaranteed as they aren't allowed to be
 * merged in the usual way.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/part_stat.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"

/* PREFLUSH/FUA sequences */
enum {
	REQ_FSEQ_PREFLUSH	= (1 << 0), /* pre-flushing in progress */
	REQ_FSEQ_DATA		= (1 << 1), /* data write in progress */
	REQ_FSEQ_POSTFLUSH	= (1 << 2), /* post-flushing in progress */
	REQ_FSEQ_DONE		= (1 << 3),

	REQ_FSEQ_ACTIONS	= REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
				  REQ_FSEQ_POSTFLUSH,

	/*
	 * If flush has been pending longer than the following timeout,
	 * it's issued even if flush_data requests are still in flight.
	 */
	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
};

static void blk_kick_flush(struct request_queue *q,
			   struct blk_flush_queue *fq, blk_opf_t flags);

static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
	return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}

static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (blk_rq_sectors(rq))
		policy |= REQ_FSEQ_DATA;

	if (fflags & (1UL << QUEUE_FLAG_WC)) {
		if (rq->cmd_flags & REQ_PREFLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
		    (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}

static unsigned int blk_flush_cur_seq(struct request *rq)
{
	return 1 << ffz(rq->flush.seq);
}

static void blk_flush_restore_request(struct request *rq)
{
	/*
	 * After flush data completion, @rq->bio is %NULL but we need to
	 * complete the bio again.  @rq->biotail is guaranteed to equal the
	 * original @rq->bio.  Restore it.
	 */
	rq->bio = rq->biotail;

	/* make @rq a normal request */
	rq->rq_flags &= ~RQF_FLUSH_SEQ;
	rq->end_io = rq->flush.saved_end_io;
}

static void blk_account_io_flush(struct request *rq)
{
	struct block_device *part = rq->q->disk->part0;

	part_stat_lock();
	part_stat_inc(part, ios[STAT_FLUSH]);
	part_stat_add(part, nsecs[STAT_FLUSH],
		      ktime_get_ns() - rq->start_time_ns);
	part_stat_unlock();
}

/**
 * blk_flush_complete_seq - complete flush sequence
 * @rq: PREFLUSH/FUA request being sequenced
 * @fq: flush queue
 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
 * @error: whether an error occurred
 *
 * @rq just completed @seq part of its flush sequence, record the
 * completion and trigger the next step.
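 *
 * For example, a request that just finished its PREFLUSH step and still
 * carries data moves on to REQ_FSEQ_DATA and is placed on the requeue list
 * so the data write can be dispatched; once that write completes, the
 * request is queued again for its POSTFLUSH, if one is needed.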
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
 */
static void blk_flush_complete_seq(struct request *rq,
				   struct blk_flush_queue *fq,
				   unsigned int seq, blk_status_t error)
{
	struct request_queue *q = rq->q;
	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	blk_opf_t cmd_flags;

	BUG_ON(rq->flush.seq & seq);
	rq->flush.seq |= seq;
	cmd_flags = rq->cmd_flags;

	if (likely(!error))
		seq = blk_flush_cur_seq(rq);
	else
		seq = REQ_FSEQ_DONE;

	switch (seq) {
	case REQ_FSEQ_PREFLUSH:
	case REQ_FSEQ_POSTFLUSH:
		/* queue for flush */
		if (list_empty(pending))
			fq->flush_pending_since = jiffies;
		list_move_tail(&rq->queuelist, pending);
		break;

	case REQ_FSEQ_DATA:
		fq->flush_data_in_flight++;
		spin_lock(&q->requeue_lock);
		list_move(&rq->queuelist, &q->requeue_list);
		spin_unlock(&q->requeue_lock);
		blk_mq_kick_requeue_list(q);
		break;

	case REQ_FSEQ_DONE:
		/*
		 * @rq was previously adjusted by blk_insert_flush() for
		 * flush sequencing and may already have gone through the
		 * flush data request completion path.  Restore @rq for
		 * normal completion and end it.
		 */
		list_del_init(&rq->queuelist);
		blk_flush_restore_request(rq);
		blk_mq_end_request(rq, error);
		break;

	default:
		BUG();
	}

	blk_kick_flush(q, fq, cmd_flags);
}

static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
				       blk_status_t error)
{
	struct request_queue *q = flush_rq->q;
	struct list_head *running;
	struct request *rq, *n;
	unsigned long flags = 0;
	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);

	/* release the tag's ownership to the req cloned from */
	spin_lock_irqsave(&fq->mq_flush_lock, flags);

	if (!req_ref_put_and_test(flush_rq)) {
		fq->rq_status = error;
		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
		return RQ_END_IO_NONE;
	}

	blk_account_io_flush(flush_rq);
	/*
	 * The flush request has to be marked as IDLE when it is really ended
	 * because its .end_io() is also called from the timeout code path,
	 * to avoid a use-after-free.
	 */
	WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
	if (fq->rq_status != BLK_STS_OK) {
		error = fq->rq_status;
		fq->rq_status = BLK_STS_OK;
	}

	if (!q->elevator) {
		flush_rq->tag = BLK_MQ_NO_TAG;
	} else {
		blk_mq_put_driver_tag(flush_rq);
		flush_rq->internal_tag = BLK_MQ_NO_TAG;
	}

	running = &fq->flush_queue[fq->flush_running_idx];
	BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);

	/* account completion of the flush request */
	fq->flush_running_idx ^= 1;

	/* and push the waiting requests to the next stage */
	list_for_each_entry_safe(rq, n, running, queuelist) {
		unsigned int seq = blk_flush_cur_seq(rq);

		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
		blk_flush_complete_seq(rq, fq, seq, error);
	}

	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
	return RQ_END_IO_NONE;
}

bool is_flush_rq(struct request *rq)
{
	return rq->end_io == flush_end_io;
}

/**
 * blk_kick_flush - consider issuing flush request
 * @q: request_queue being kicked
 * @fq: flush queue
 * @flags: cmd_flags of the original request
 *
 * Flush related states of @q have changed, consider issuing flush request.
 * Please read the comment at the top of this file for more info.
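 *
 * A flush is issued here only when there are pending requests, no other
 * flush is in flight (C1), and either no flush data request is in flight
 * or a flush has been pending longer than FLUSH_PENDING_TIMEOUT (C2, C3).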
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
 *
 */
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
			   blk_opf_t flags)
{
	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	struct request *first_rq =
		list_first_entry(pending, struct request, queuelist);
	struct request *flush_rq = fq->flush_rq;

	/* C1 described at the top of this file */
	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
		return;

	/* C2 and C3 */
	if (fq->flush_data_in_flight &&
	    time_before(jiffies,
			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
		return;

	/*
	 * Issue flush and toggle pending_idx.  This makes pending_idx
	 * different from running_idx, which means flush is in flight.
	 */
	fq->flush_pending_idx ^= 1;

	blk_rq_init(q, flush_rq);

	/*
	 * In case of the none scheduler, borrow the tag from the first
	 * request since they can't be in flight at the same time, and take
	 * over the tag's ownership for the flush request.
	 *
	 * In case of an IO scheduler, the flush rq needs to borrow the
	 * scheduler tag just so the driver tag put/get paths keep working.
	 */
	flush_rq->mq_ctx = first_rq->mq_ctx;
	flush_rq->mq_hctx = first_rq->mq_hctx;

	if (!q->elevator) {
		flush_rq->tag = first_rq->tag;

		/*
		 * We borrow the data request's driver tag, so we have to
		 * mark this flush request as INFLIGHT to avoid double
		 * accounting of this driver tag.
		 */
		flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
	} else
		flush_rq->internal_tag = first_rq->internal_tag;

	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
	flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
	flush_rq->end_io = flush_end_io;
	/*
	 * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
	 * implied in refcount_inc_not_zero() called from
	 * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref
	 * and READ flush_rq->end_io
	 */
	smp_wmb();
	req_ref_set(flush_rq, 1);

	spin_lock(&q->requeue_lock);
	list_add_tail(&flush_rq->queuelist, &q->flush_list);
	spin_unlock(&q->requeue_lock);

	blk_mq_kick_requeue_list(q);
}

static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
					       blk_status_t error)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	unsigned long flags;
	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);

	if (q->elevator) {
		WARN_ON(rq->tag < 0);
		blk_mq_put_driver_tag(rq);
	}

	/*
	 * After populating an empty queue, kick it to avoid stall.  Read
	 * the comment in flush_end_io().
	 */
	spin_lock_irqsave(&fq->mq_flush_lock, flags);
	fq->flush_data_in_flight--;
	/*
	 * rq->queuelist may have been corrupted by rq->rq_next reuse, so
	 * re-initialize it before reusing it here.
	 */
	INIT_LIST_HEAD(&rq->queuelist);
	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);

	blk_mq_sched_restart(hctx);
	return RQ_END_IO_NONE;
}

static void blk_rq_init_flush(struct request *rq)
{
	rq->flush.seq = 0;
	rq->rq_flags |= RQF_FLUSH_SEQ;
	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
	rq->end_io = mq_flush_data_end_io;
}

/*
 * Insert a PREFLUSH/FUA request into the flush state machine.
 * Returns true if the request has been consumed by the flush state machine,
 * or false if the caller should continue to process it.
 */
bool blk_insert_flush(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned long fflags = q->queue_flags;	/* may change, cache */
	unsigned int policy = blk_flush_policy(fflags, rq);
	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);

	/* FLUSH/FUA request must never be merged */
	WARN_ON_ONCE(rq->bio != rq->biotail);

	/*
	 * @policy now records what operations need to be done.  Adjust
	 * REQ_PREFLUSH and FUA for the driver.
	 */
	rq->cmd_flags &= ~REQ_PREFLUSH;
	if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
		rq->cmd_flags &= ~REQ_FUA;

	/*
	 * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any
	 * of those flags, we have to set REQ_SYNC to avoid skewing
	 * the request accounting.
	 */
	rq->cmd_flags |= REQ_SYNC;

	switch (policy) {
	case 0:
		/*
		 * An empty flush handed down from a stacking driver may
		 * translate into nothing if the underlying device does not
		 * advertise a write-back cache.  In this case, simply
		 * complete the request.
		 */
		blk_mq_end_request(rq, 0);
		return true;
	case REQ_FSEQ_DATA:
		/*
		 * If there's data, but no flush is necessary, the request can
		 * be processed directly without going through flush machinery.
		 * Queue for normal execution.
		 */
		return false;
	case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH:
		/*
		 * Initialize the flush fields and completion handler to trigger
		 * the post flush, and then just pass the command on.
		 */
		blk_rq_init_flush(rq);
		rq->flush.seq |= REQ_FSEQ_PREFLUSH;
		spin_lock_irq(&fq->mq_flush_lock);
		fq->flush_data_in_flight++;
		spin_unlock_irq(&fq->mq_flush_lock);
		return false;
	default:
		/*
		 * Mark the request as part of a flush sequence and submit it
		 * for further processing to the flush state machine.
		 */
		blk_rq_init_flush(rq);
		spin_lock_irq(&fq->mq_flush_lock);
		blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
		spin_unlock_irq(&fq->mq_flush_lock);
		return true;
	}
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:	blockdev to issue flush for
 *
 * Description:
 *    Issue a flush for the block device in question.
 */
int blkdev_issue_flush(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
	return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);

struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
					      gfp_t flags)
{
	struct blk_flush_queue *fq;
	int rq_sz = sizeof(struct request);

	fq = kzalloc_node(sizeof(*fq), flags, node);
	if (!fq)
		goto fail;

	spin_lock_init(&fq->mq_flush_lock);

	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
	fq->flush_rq = kzalloc_node(rq_sz, flags, node);
	if (!fq->flush_rq)
		goto fail_rq;

	INIT_LIST_HEAD(&fq->flush_queue[0]);
	INIT_LIST_HEAD(&fq->flush_queue[1]);

	return fq;

 fail_rq:
	kfree(fq);
 fail:
	return NULL;
}

void blk_free_flush_queue(struct blk_flush_queue *fq)
{
	/* bio-based request queues have no flush queue */
	if (!fq)
		return;

	kfree(fq->flush_rq);
	kfree(fq);
}

/*
 * Allow a driver to set its own lock class for fq->mq_flush_lock to avoid
 * lockdep complaints.
 *
 * flush_end_io() may be called recursively from some driver, such as
 * nvme-loop, so lockdep may complain about 'possible recursive locking'
 * because all 'struct blk_flush_queue' instances share the same
 * mq_flush_lock lock class key.  Assign a different lock class to such a
 * driver's fq->mq_flush_lock to avoid the lockdep warning.
 *
 * Using a dynamically allocated lock class key for each 'blk_flush_queue'
 * instance would be overkill and, worse, it introduces a horrible boot
 * delay because synchronize_rcu() is implied in lockdep_unregister_key(),
 * which is called for each hctx release.  SCSI probing may synchronously
 * create and destroy lots of MQ request_queues for non-existent devices,
 * and some robot test kernels always enable lockdep.  With a per-fq lock
 * class, SCSI MQ probing has been observed to take more than half an hour.
 */
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key)
{
	lockdep_set_class(&hctx->fq->mq_flush_lock, key);
}
EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
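
/*
 * Illustrative usage sketch (hypothetical driver code; the foo_* names are
 * examples only, not an existing in-tree user): a driver whose
 * flush_end_io() may be reached recursively, e.g. a loopback-style target,
 * would hand all of its hw queues one shared static lock class key from
 * its ->init_hctx() callback:
 *
 *	static struct lock_class_key foo_hctx_fq_lock_key;
 *
 *	static int foo_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 *				 unsigned int hctx_idx)
 *	{
 *		blk_mq_hctx_set_fq_lock_class(hctx, &foo_hctx_fq_lock_key);
 *		return 0;
 *	}
 *
 * One static key per driver keeps lockdep quiet without the per-fq key
 * registration cost described above.
 */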