// SPDX-License-Identifier: GPL-2.0
/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>

/*
 * Scheduling domains: the device is divided into multiple domains based on the
 * request type.
 */
enum {
	KYBER_READ,
	KYBER_WRITE,
	KYBER_DISCARD,
	KYBER_OTHER,
	KYBER_NUM_DOMAINS,
};

static const char *kyber_domain_names[] = {
	[KYBER_READ] = "READ",
	[KYBER_WRITE] = "WRITE",
	[KYBER_DISCARD] = "DISCARD",
	[KYBER_OTHER] = "OTHER",
};

enum {
	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};

/*
 * Maximum device-wide depth for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate the
 * device with only a fraction of the maximum possible queue depth. So, we cap
 * these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_WRITE] = 128,
	[KYBER_DISCARD] = 64,
	[KYBER_OTHER] = 16,
};
/*
 * Default latency targets for each scheduling domain.
 */
static const u64 kyber_latency_targets[] = {
	[KYBER_READ] = 2ULL * NSEC_PER_MSEC,
	[KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
	[KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
};

/*
 * Batch size (number of requests we'll dispatch in a row) for each scheduling
 * domain.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_WRITE] = 8,
	[KYBER_DISCARD] = 1,
	[KYBER_OTHER] = 1,
};

/*
 * Request latencies are recorded in a histogram with buckets defined relative
 * to the target latency:
 *
 * <= 1/4 * target latency
 * <= 1/2 * target latency
 * <= 3/4 * target latency
 * <= target latency
 * <= 1 1/4 * target latency
 * <= 1 1/2 * target latency
 * <= 1 3/4 * target latency
 * > 1 3/4 * target latency
 */
enum {
	/*
	 * The width of the latency histogram buckets is
	 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
	 */
	KYBER_LATENCY_SHIFT = 2,
	/*
	 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
	 * thus, "good".
	 */
	KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
	/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
	KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
};
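/*
 * Worked example (derived from the constants above, not new behavior): with
 * the default 2 ms read target, the bucket width is 2 ms / 4 = 500 us, so a
 * read completing in 1.2 ms falls in bucket 2 ("<= 3/4 * target latency") and
 * one completing in 4 ms falls in the last bucket ("> 1 3/4 * target
 * latency").
 */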
/*
 * We measure both the total latency and the I/O latency (i.e., latency after
 * submitting to the device).
 */
enum {
	KYBER_TOTAL_LATENCY,
	KYBER_IO_LATENCY,
};

static const char *kyber_latency_type_names[] = {
	[KYBER_TOTAL_LATENCY] = "total",
	[KYBER_IO_LATENCY] = "I/O",
};

/*
 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
 * domain except for KYBER_OTHER.
 */
struct kyber_cpu_latency {
	atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};
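/*
 * Sizing note (a derived observation, not a new invariant): KYBER_OTHER is 3,
 * so each CPU carries 3 domains * 2 latency types * 8 buckets = 48 atomic
 * counters. The timer callback drains these into the device-wide histograms
 * in struct kyber_queue_data below.
 */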
/*
 * The mapping between ctx and kcq mirrors the mapping between hctx and khd;
 * we use request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
	/*
	 * Used to ensure that operations on rq_list and kcq_map are atomic.
	 * Also protects the requests on rq_list during merging.
	 */
	spinlock_t lock;
	struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;

struct kyber_queue_data {
	struct request_queue *q;
	dev_t dev;

	/*
	 * Each scheduling domain has a limited number of in-flight requests
	 * device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	struct kyber_cpu_latency __percpu *cpu_latency;

	/* Timer for stats aggregation and adjusting domain tokens. */
	struct timer_list timer;

	unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];

	unsigned long latency_timeout[KYBER_OTHER];

	int domain_p99[KYBER_OTHER];

	/* Target latencies in nanoseconds. */
	u64 latency_targets[KYBER_OTHER];
};

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	struct kyber_ctx_queue *kcqs;
	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
	struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key);

static unsigned int kyber_sched_domain(blk_opf_t opf)
{
	switch (opf & REQ_OP_MASK) {
	case REQ_OP_READ:
		return KYBER_READ;
	case REQ_OP_WRITE:
		return KYBER_WRITE;
	case REQ_OP_DISCARD:
		return KYBER_DISCARD;
	default:
		return KYBER_OTHER;
	}
}

static void flush_latency_buckets(struct kyber_queue_data *kqd,
				  struct kyber_cpu_latency *cpu_latency,
				  unsigned int sched_domain, unsigned int type)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
	unsigned int bucket;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
}
/*
 * Calculate the histogram bucket with the given percentile rank, or -1 if there
 * aren't enough samples yet.
 */
static int calculate_percentile(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int type,
				unsigned int percentile)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	unsigned int bucket, samples = 0, percentile_samples;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		samples += buckets[bucket];

	if (!samples)
		return -1;

	/*
	 * We do the calculation once we have 500 samples or one second passes
	 * since the first sample was recorded, whichever comes first.
	 */
	if (!kqd->latency_timeout[sched_domain])
		kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
	if (samples < 500 &&
	    time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
		return -1;
	}
	kqd->latency_timeout[sched_domain] = 0;

	percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
		if (buckets[bucket] >= percentile_samples)
			break;
		percentile_samples -= buckets[bucket];
	}
	memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));

	trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain],
			    kyber_latency_type_names[type], percentile,
			    bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);

	return bucket;
}
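/*
 * Worked example of the percentile walk above (illustrative numbers only):
 * with buckets = {50, 30, 10, 5, 3, 1, 1, 0} (100 samples) and percentile =
 * 90, percentile_samples starts at 90; bucket 0 leaves 40, bucket 1 leaves
 * 10, and bucket 2 holds at least 10 samples, so the p90 is bucket 2, i.e.
 * "<= 3/4 * target latency".
 */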
static void kyber_resize_domain(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int depth)
{
	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
		trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain],
				   depth);
	}
}

static void kyber_timer_fn(struct timer_list *t)
{
	struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
	unsigned int sched_domain;
	int cpu;
	bool bad = false;

	/* Sum all of the per-cpu latency histograms. */
	for_each_online_cpu(cpu) {
		struct kyber_cpu_latency *cpu_latency;

		cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
		for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_TOTAL_LATENCY);
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_IO_LATENCY);
		}
	}

	/*
	 * Check if any domains have a high I/O latency, which might indicate
	 * congestion in the device. Note that we use the p90; we don't want to
	 * be too sensitive to outliers here.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		int p90;

		p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
					   90);
		if (p90 >= KYBER_GOOD_BUCKETS)
			bad = true;
	}

	/*
	 * Adjust the scheduling domain depths. If we determined that there was
	 * congestion, we throttle all domains with good latencies. Either way,
	 * we ease up on throttling domains with bad latencies.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		unsigned int orig_depth, depth;
		int p99;

		p99 = calculate_percentile(kqd, sched_domain,
					   KYBER_TOTAL_LATENCY, 99);
		/*
		 * This is kind of subtle: different domains will not
		 * necessarily have enough samples to calculate the latency
		 * percentiles during the same window, so we have to remember
		 * the p99 for the next time we observe congestion; once we do,
		 * we don't want to throttle again until we get more data, so we
		 * reset it to -1.
		 */
		if (bad) {
			if (p99 < 0)
				p99 = kqd->domain_p99[sched_domain];
			kqd->domain_p99[sched_domain] = -1;
		} else if (p99 >= 0) {
			kqd->domain_p99[sched_domain] = p99;
		}
		if (p99 < 0)
			continue;

		/*
		 * If this domain has bad latency, throttle less. Otherwise,
		 * throttle more iff we determined that there is congestion.
		 *
		 * The new depth is scaled linearly with the p99 latency vs the
		 * latency target. E.g., if the p99 is 3/4 of the target, then
		 * we throttle down to 3/4 of the current depth, and if the p99
		 * is 2x the target, then we double the depth.
		 */
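		/*
		 * Worked example (derived from the expression below): with
		 * orig_depth = 256, a p99 in bucket 2 ("<= 3/4 * target")
		 * gives 256 * 3 / 4 = 192, and a p99 in bucket 7 ("> 1 3/4 *
		 * target") gives 256 * 8 / 4 = 512, which
		 * kyber_resize_domain() then clamps to kyber_depth[].
		 */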
		if (bad || p99 >= KYBER_GOOD_BUCKETS) {
			orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
			depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
			kyber_resize_domain(kqd, sched_domain, depth);
		}
	}
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	int ret = -ENOMEM;
	int i;

	kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;

	kqd->q = q;
	kqd->dev = disk_devt(q->disk);

	kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
					    GFP_KERNEL | __GFP_ZERO);
	if (!kqd->cpu_latency)
		goto err_kqd;

	timer_setup(&kqd->timer, kyber_timer_fn, 0);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      kyber_depth[i], -1, false,
					      GFP_KERNEL, q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_buckets;
		}
	}

	for (i = 0; i < KYBER_OTHER; i++) {
		kqd->domain_p99[i] = -1;
		kqd->latency_targets[i] = kyber_latency_targets[i];
	}

	return kqd;

err_buckets:
	free_percpu(kqd->cpu_latency);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}
static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	blk_stat_enable_accounting(q);

	blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);

	eq->elevator_data = kqd;
	q->elevator = eq;

	return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	int i;

	timer_shutdown_sync(&kqd->timer);
	blk_stat_disable_accounting(kqd->q);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	free_percpu(kqd->cpu_latency);
	kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
	unsigned int i;

	spin_lock_init(&kcq->lock);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		INIT_LIST_HEAD(&kcq->rq_list[i]);
}

static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct blk_mq_tags *tags = hctx->sched_tags;
	unsigned int shift = tags->bitmap_tags.sb.shift;

	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}
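/*
 * Worked example for kyber_depth_updated() (illustrative, since the sbitmap
 * shift depends on the tag set): with a per-word shift of 6, each sbitmap
 * word covers 64 tags, so async_depth = 64 * 75 / 100 = 48; async requests
 * may consume at most 75% of each word, leaving 25% reserved for sync I/O.
 */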
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
				       sizeof(struct kyber_ctx_queue),
				       GFP_KERNEL, hctx->numa_node);
	if (!khd->kcqs)
		goto err_khd;

	for (i = 0; i < hctx->nr_ctx; i++)
		kyber_ctx_queue_init(&khd->kcqs[i]);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
				      ilog2(8), GFP_KERNEL, hctx->numa_node,
				      false, false)) {
			while (--i >= 0)
				sbitmap_free(&khd->kcq_map[i]);
			goto err_kcqs;
		}
	}

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		khd->domain_wait[i].sbq = NULL;
		init_waitqueue_func_entry(&khd->domain_wait[i].wait,
					  kyber_domain_wake);
		khd->domain_wait[i].wait.private = hctx;
		INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;
	kyber_depth_updated(hctx);

	return 0;

err_kcqs:
	kfree(khd->kcqs);
err_khd:
	kfree(khd);
	return -ENOMEM;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_free(&khd->kcq_map[i]);
	kfree(khd->kcqs);
	kfree(hctx->sched_data);
}

static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = kyber_sched_domain(rq->cmd_flags);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}
static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(opf)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}

static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
			    unsigned int nr_segs)
{
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
	struct list_head *rq_list = &kcq->rq_list[sched_domain];
	bool merged;

	spin_lock(&kcq->lock);
	merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
	spin_unlock(&kcq->lock);

	return merged;
}

static void kyber_prepare_request(struct request *rq)
{
	rq_set_domain_token(rq, -1);
}

static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct list_head *rq_list,
				  blk_insert_t flags)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
		struct list_head *head = &kcq->rq_list[sched_domain];

		spin_lock(&kcq->lock);
		trace_block_rq_insert(rq);
		if (flags & BLK_MQ_INSERT_AT_HEAD)
			list_move(&rq->queuelist, head);
		else
			list_move_tail(&rq->queuelist, head);
		sbitmap_set_bit(&khd->kcq_map[sched_domain],
				rq->mq_ctx->index_hw[hctx->type]);
		spin_unlock(&kcq->lock);
	}
}

static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}

static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
			       unsigned int sched_domain, unsigned int type,
			       u64 target, u64 latency)
{
	unsigned int bucket;
	u64 divisor;

	if (latency > 0) {
		divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
		bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
			       KYBER_LATENCY_BUCKETS - 1);
	} else {
		bucket = 0;
	}

	atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
}
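/*
 * Worked example for add_latency_sample() (numbers chosen for illustration):
 * with the default 2 ms read target, divisor = 2 ms >> 2 = 500 us. A 1.2 ms
 * sample gives (1200000 - 1) / 500000 = 2, so it lands in bucket 2; a 10 ms
 * sample gives 19, which min_t() caps at KYBER_LATENCY_BUCKETS - 1 = 7.
 */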
static void kyber_completed_request(struct request *rq, u64 now)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
	struct kyber_cpu_latency *cpu_latency;
	unsigned int sched_domain;
	u64 target;

	sched_domain = kyber_sched_domain(rq->cmd_flags);
	if (sched_domain == KYBER_OTHER)
		return;

	cpu_latency = get_cpu_ptr(kqd->cpu_latency);
	target = kqd->latency_targets[sched_domain];
	add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
			   target, now - rq->start_time_ns);
	add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
			   now - rq->io_start_time_ns);
	put_cpu_ptr(kqd->cpu_latency);

	timer_reduce(&kqd->timer, jiffies + HZ / 10);
}

struct flush_kcq_data {
	struct kyber_hctx_data *khd;
	unsigned int sched_domain;
	struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_kcq_data *flush_data = data;
	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

	spin_lock(&kcq->lock);
	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
			      flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&kcq->lock);

	return true;
}

static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
				  unsigned int sched_domain,
				  struct list_head *list)
{
	struct flush_kcq_data data = {
		.khd = khd,
		.sched_domain = sched_domain,
		.list = list,
	};

	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
			     flush_busy_kcq, &data);
}
static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
	struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);

	sbitmap_del_wait_queue(wait);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}
static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	struct sbq_wait *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		khd->domain_ws[sched_domain] = ws;
		sbitmap_add_wait_queue(domain_tokens, ws, wait);

		/*
		 * Try again in case a token was freed before we got on the wait
		 * queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}

	/*
	 * If we got a token while we were on the wait queue, remove ourselves
	 * from the wait queue to ensure that all wake ups make forward
	 * progress. It's possible that the waker already deleted the entry
	 * between the !list_empty_careful() check and us grabbing the lock, but
	 * list_del_init() is okay with that.
	 */
	if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
		ws = khd->domain_ws[sched_domain];
		spin_lock_irq(&ws->wait.lock);
		sbitmap_del_wait_queue(wait);
		spin_unlock_irq(&ws->wait.lock);
	}

	return nr;
}
static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];

	/*
	 * If we already have a flushed request, then we just need to get a
	 * token for it. Otherwise, if there are pending requests in the kcqs,
	 * flush the kcqs, but only if we can get a token. If not, we should
	 * leave the requests in the kcqs so that they can be merged. Note that
	 * khd->lock serializes the flushes, so if we observed any bit set in
	 * the kcq_map, we will always get a request.
	 */
	rq = list_first_entry_or_null(rqs, struct request, queuelist);
	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		} else {
			trace_kyber_throttled(kqd->dev,
					      kyber_domain_names[khd->cur_domain]);
		}
	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
			rq = list_first_entry(rqs, struct request, queuelist);
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		} else {
			trace_kyber_throttled(kqd->dev,
					      kyber_domain_names[khd->cur_domain]);
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]) ||
		    sbitmap_any_bit_set(&khd->kcq_map[i]))
			return true;
	}

	return false;
}
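/*
 * The macros below generate the sysfs handlers for the per-domain latency
 * targets. Assuming the usual elevator sysfs layout, they can be tuned at
 * runtime with, e.g.:
 *
 *   echo 10000000 > /sys/block/<disk>/queue/iosched/read_lat_nsec
 *
 * which sets a 10 ms read latency target ("<disk>" is a placeholder).
 */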
#define KYBER_LAT_SHOW_STORE(domain, name)				\
static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,	\
				       char *page)			\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
									\
	return sprintf(page, "%llu\n", kqd->latency_targets[domain]);	\
}									\
									\
static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,	\
					const char *page, size_t count)	\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
	unsigned long long nsec;					\
	int ret;							\
									\
	ret = kstrtoull(page, 10, &nsec);				\
	if (ret)							\
		return ret;						\
									\
	kqd->latency_targets[domain] = nsec;				\
									\
	return count;							\
}
KYBER_LAT_SHOW_STORE(KYBER_READ, read);
KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR
#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)			\
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
{									\
	struct request_queue *q = data;					\
	struct kyber_queue_data *kqd = q->elevator->elevator_data;	\
									\
	sbitmap_queue_show(&kqd->domain_tokens[domain], m);		\
	return 0;							\
}									\
									\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)	\
	__acquires(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_lock(&khd->lock);						\
	return seq_list_start(&khd->rqs[domain], *pos);			\
}									\
									\
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,	\
				     loff_t *pos)			\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	return seq_list_next(v, &khd->rqs[domain], pos);		\
}									\
									\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)	\
	__releases(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_unlock(&khd->lock);					\
}									\
									\
static const struct seq_operations kyber_##name##_rqs_seq_ops = {	\
	.start	= kyber_##name##_rqs_start,				\
	.next	= kyber_##name##_rqs_next,				\
	.stop	= kyber_##name##_rqs_stop,				\
	.show	= blk_mq_debugfs_rq_show,				\
};									\
									\
static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
{									\
	struct blk_mq_hw_ctx *hctx = data;				\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
	wait_queue_entry_t *wait = &khd->domain_wait[domain].wait;	\
									\
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
	return 0;							\
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", kqd->async_depth);
	return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
	return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%u\n", khd->batching);
	return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name)	\
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(write),
	KYBER_QUEUE_DOMAIN_ATTRS(discard),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS
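/*
 * Assuming debugfs is mounted at /sys/kernel/debug, the queue-level
 * attributes above typically show up under
 * /sys/kernel/debug/block/<disk>/sched/ and the per-hardware-queue ones
 * below under /sys/kernel/debug/block/<disk>/hctx<N>/sched/; "<disk>" and
 * "<N>" are placeholders.
 */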
#define KYBER_HCTX_DOMAIN_ATTRS(name)					\
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},	\
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(write),
	KYBER_HCTX_DOMAIN_ATTRS(discard),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif

static struct elevator_type kyber_sched = {
	.ops = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.bio_merge = kyber_bio_merge,
		.prepare_request = kyber_prepare_request,
		.insert_requests = kyber_insert_requests,
		.finish_request = kyber_finish_request,
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
		.depth_updated = kyber_depth_updated,
	},
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");