// SPDX-License-Identifier: GPL-2.0
/*
 * Block rq-qos base io controller
 *
 * This works similarly to wbt, with a few exceptions:
 *
 * - It's bio based, so the latency covers the whole block layer in addition to
 *   the actual io.
 * - We will throttle all IO that comes in here if we need to.
 * - We use the mean latency over the 100ms window.  This is because writes can
 *   be particularly fast, which could give us a false sense of the impact of
 *   other workloads on our protected workload.
 * - By default there's no throttling, we set the queue_depth to UINT_MAX so
 *   that we can have as many outstanding bio's as we're allowed to.  Only at
 *   throttle time do we pay attention to the actual queue depth.
 *
 * The hierarchy works like the cpu controller does, we track the latency at
 * every configured node, and each configured node has its own independent
 * queue depth.  This means that we only care about our latency targets at the
 * peer level.  Some group at the bottom of the hierarchy isn't going to affect
 * a group at the end of some other path if we're only configured at leaf level.
 *
 * Consider the following:
 *
 *                   root blkg
 *             /                     \
 *        fast (target=5ms)     slow (target=10ms)
 *         /     \                  /       \
 *       a        b          normal(15ms)  unloved
 *
 * "a" and "b" have no target, but their combined io under "fast" cannot exceed
 * an average latency of 5ms.  If it does then we will throttle the "slow"
 * group.  In the case of "normal", if it exceeds its 15ms target, we will
 * throttle "unloved", but nobody else.
 *
 * In this example "fast", "slow", and "normal" will be the only groups actually
 * accounting their io latencies.  We have to walk up the hierarchy to the root
 * on every submit and complete so we can do the appropriate stat recording and
 * adjust the queue depth of ourselves if needed.
 *
 * There are 2 ways we throttle IO.
 *
 * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
 * number of IO's we're allowed to have in flight.  This starts at UINT_MAX
 * down to 1.  If the group is only ever submitting IO for itself then this is
 * the only way we throttle.
 *
 * 2) Induced delay throttling.  This is for the case that a group is generating
 * IO that has to be issued by the root cg to avoid priority inversion.  So
 * think REQ_META or REQ_SWAP.  If we are already at qd == 1 and we're getting
 * a lot of work done for us on behalf of the root cg and are being asked to
 * scale down more, then we induce a latency at userspace return.  We
 * accumulate the total amount of time we need to be punished by doing
 *
 *   total_time += min_lat_nsec - actual_io_completion
 *
 * and then at throttle time will do
 *
 *   throttle_time = min(total_time, NSEC_PER_SEC)
 *
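 * For example, with a 5ms target, a swap write issued on the group's behalf
 * by the root cg that completes in 1ms adds 4ms to total_time, and once
 * enough of that accumulates the offending task sleeps for up to a second
 * when it returns to userspace.
 *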
 * This induced delay will throttle back the activity that is generating the
 * root cg issued io's, whether that's some metadata intensive operation or the
 * group is using so much memory that it is pushing us into swap.
 *
 * Copyright (C) 2018 Josef Bacik
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/memcontrol.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-cgroup.h"
#include "blk.h"

#define DEFAULT_SCALE_COOKIE 1000000U

static struct blkcg_policy blkcg_policy_iolatency;
struct iolatency_grp;

struct blk_iolatency {
	struct rq_qos rqos;
	struct timer_list timer;

	/*
	 * ->enabled is the master enable switch gating the throttling logic and
	 * inflight tracking. The number of cgroups which have iolat enabled is
	 * tracked in ->enable_cnt, and ->enabled is flipped on/off accordingly
	 * from ->enable_work with the request_queue frozen. For details, see
	 * blkiolatency_enable_work_fn().
	 */
	bool enabled;
	atomic_t enable_cnt;
	struct work_struct enable_work;
};

static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
{
	return container_of(rqos, struct blk_iolatency, rqos);
}

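/*
 * Scaling state a parent keeps on behalf of all of its children.  Each child
 * compares its private scale_cookie copy against the one stored here to work
 * out which direction check_scale_change() should move its queue depth.
 */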
struct child_latency_info {
	spinlock_t lock;

	/* Last time we adjusted the scale of everybody. */
	u64 last_scale_event;

	/* The latency that we missed. */
	u64 scale_lat;

	/* Total io's from all of our children for the last summation. */
	u64 nr_samples;

	/* The guy who actually changed the latency numbers. */
	struct iolatency_grp *scale_grp;

	/* Cookie to tell if we need to scale up or down. */
	atomic_t scale_cookie;
};

struct percentile_stats {
	u64 total;
	u64 missed;
};

struct latency_stat {
	union {
		struct percentile_stats ps;
		struct blk_rq_stat rqs;
	};
};

struct iolatency_grp {
	struct blkg_policy_data pd;
	struct latency_stat __percpu *stats;
	struct latency_stat cur_stat;
	struct blk_iolatency *blkiolat;
	unsigned int max_depth;
	struct rq_wait rq_wait;
	atomic64_t window_start;
	atomic_t scale_cookie;
	u64 min_lat_nsec;
	u64 cur_win_nsec;

	/* total running average of our io latency. */
	u64 lat_avg;

	/* Our current number of IO's for the last summation. */
	u64 nr_samples;

	bool ssd;
	struct child_latency_info child_lat;
};

#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
 * These are the constants used to fake the fixed-point moving average
 * calculation just like load average.  The call to calc_load() folds
 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
 * window size is bucketed to try to approximately calculate average
 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 * elapse immediately.  Note, windows only elapse with IO activity.  Idle
 * periods extend the most recent window.
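 * For example, the 100ms minimum window maps to the first factor (a decay of
 * roughly 1/600 per window) and the 1s maximum window maps to the last
 * (roughly 1/60 per window).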
 */
#define BLKIOLATENCY_NR_EXP_FACTORS 5
#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
				      (BLKIOLATENCY_NR_EXP_FACTORS - 1))
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
	2045, // exp(1/600) - 600 samples
	2039, // exp(1/240) - 240 samples
	2031, // exp(1/120) - 120 samples
	2023, // exp(1/80) - 80 samples
	2014, // exp(1/60) - 60 samples
};

static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
}

static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
{
	return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
}

static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
{
	return pd_to_blkg(&iolat->pd);
}

static inline void latency_stat_init(struct iolatency_grp *iolat,
				     struct latency_stat *stat)
{
	if (iolat->ssd) {
		stat->ps.total = 0;
		stat->ps.missed = 0;
	} else
		blk_rq_stat_init(&stat->rqs);
}

static inline void latency_stat_sum(struct iolatency_grp *iolat,
				    struct latency_stat *sum,
				    struct latency_stat *stat)
{
	if (iolat->ssd) {
		sum->ps.total += stat->ps.total;
		sum->ps.missed += stat->ps.missed;
	} else
		blk_rq_stat_sum(&sum->rqs, &stat->rqs);
}

static inline void latency_stat_record_time(struct iolatency_grp *iolat,
					    u64 req_time)
{
	struct latency_stat *stat = get_cpu_ptr(iolat->stats);
	if (iolat->ssd) {
		if (req_time >= iolat->min_lat_nsec)
			stat->ps.missed++;
		stat->ps.total++;
	} else
		blk_rq_stat_add(&stat->rqs, req_time);
	put_cpu_ptr(stat);
}

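/*
 * A window is "ok" on ssd devices if fewer than 10% of its samples missed the
 * latency target, and on rotational devices if the mean latency stayed at or
 * below the target.
 */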
static inline bool latency_sum_ok(struct iolatency_grp *iolat,
				  struct latency_stat *stat)
{
	if (iolat->ssd) {
		u64 thresh = div64_u64(stat->ps.total, 10);
		thresh = max(thresh, 1ULL);
		return stat->ps.missed < thresh;
	}
	return stat->rqs.mean <= iolat->min_lat_nsec;
}

static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
				       struct latency_stat *stat)
{
	if (iolat->ssd)
		return stat->ps.total;
	return stat->rqs.nr_samples;
}

static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
					      struct latency_stat *stat)
{
	int exp_idx;

	if (iolat->ssd)
		return;

	/*
	 * calc_load() takes in a number stored in fixed point representation.
	 * Because we are using this for IO time in ns, the values stored
	 * are significantly larger than the FIXED_1 denominator (2048).
	 * Therefore, rounding errors in the calculation are negligible and
	 * can be ignored.
	 */
	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
			div64_u64(iolat->cur_win_nsec,
				  BLKIOLATENCY_EXP_BUCKET_SIZE));
	iolat->lat_avg = calc_load(iolat->lat_avg,
				   iolatency_exp_factors[exp_idx],
				   stat->rqs.mean);
}

static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	atomic_dec(&rqw->inflight);
	wake_up(&rqw->wait);
}

static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
	struct iolatency_grp *iolat = private_data;
	return rq_wait_inc_below(rqw, iolat->max_depth);
}

static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
				       struct iolatency_grp *iolat,
				       bool issue_as_root,
				       bool use_memdelay)
{
	struct rq_wait *rqw = &iolat->rq_wait;
	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);

	if (use_delay)
		blkcg_schedule_throttle(rqos->disk, use_memdelay);

	/*
	 * To avoid priority inversions we want to just take a slot if we are
	 * issuing as root.  If we're being killed off there's no point in
	 * delaying things, we may have been killed by OOM so throttling may
	 * make recovery take even longer, so just let the IO's through so the
	 * task can go away.
	 */
	if (issue_as_root || fatal_signal_pending(current)) {
		atomic_inc(&rqw->inflight);
		return;
	}

	rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
}

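/*
 * Step sizes used by scale_amount(): scale the queue depth down in chunks of
 * a quarter of nr_requests, and scale it back up more cautiously in chunks of
 * one sixteenth.
 */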
#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR 4

static inline unsigned long scale_amount(unsigned long qd, bool up)
{
	return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
}

/*
 * We scale the qd down faster than we scale up, so we need to use this helper
 * to adjust the scale_cookie accordingly so we don't prematurely get
 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 *
 * Each group has its own local copy of the last scale cookie it saw, so if
 * the global scale cookie goes up or down it knows which way it needs to go
 * based on its last knowledge of it.
 */
static void scale_cookie_change(struct blk_iolatency *blkiolat,
				struct child_latency_info *lat_info,
				bool up)
{
	unsigned long qd = blkiolat->rqos.disk->queue->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = atomic_read(&lat_info->scale_cookie);
	unsigned long max_scale = qd << 1;
	unsigned long diff = 0;

	if (old < DEFAULT_SCALE_COOKIE)
		diff = DEFAULT_SCALE_COOKIE - old;

	if (up) {
		if (scale + old > DEFAULT_SCALE_COOKIE)
			atomic_set(&lat_info->scale_cookie,
				   DEFAULT_SCALE_COOKIE);
		else if (diff > qd)
			atomic_inc(&lat_info->scale_cookie);
		else
			atomic_add(scale, &lat_info->scale_cookie);
	} else {
		/*
		 * We don't want to dig a hole so deep that it takes us hours to
		 * dig out of it.  Just enough that we don't throttle/unthrottle
		 * with jagged workloads but can still unthrottle once pressure
		 * has sufficiently dissipated.
		 */
		if (diff > qd) {
			if (diff < max_scale)
				atomic_dec(&lat_info->scale_cookie);
		} else {
			atomic_sub(scale, &lat_info->scale_cookie);
		}
	}
}

/*
 * Change the queue depth of the iolatency_grp.  We add 1/16th of the
 * queue depth at a time so we don't get wild swings and hopefully dial in to
 * fairer distribution of the overall queue depth.  We halve the queue depth
 * at a time so we can scale down queue depth quickly from default unlimited
 * to target.
 */
static void scale_change(struct iolatency_grp *iolat, bool up)
{
	unsigned long qd = iolat->blkiolat->rqos.disk->queue->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = iolat->max_depth;

	if (old > qd)
		old = qd;

	if (up) {
		if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
			return;

		if (old < qd) {
			old += scale;
			old = min(old, qd);
			iolat->max_depth = old;
			wake_up_all(&iolat->rq_wait.wait);
		}
	} else {
		old >>= 1;
		iolat->max_depth = max(old, 1UL);
	}
}

/* Check our parent and see if the scale cookie has changed. */
static void check_scale_change(struct iolatency_grp *iolat)
{
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	unsigned int cur_cookie;
	unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
	u64 scale_lat;
	int direction = 0;

	parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;
	cur_cookie = atomic_read(&lat_info->scale_cookie);
	scale_lat = READ_ONCE(lat_info->scale_lat);

	if (cur_cookie < our_cookie)
		direction = -1;
	else if (cur_cookie > our_cookie)
		direction = 1;
	else
		return;

	if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) {
		/* Somebody beat us to the punch, just bail. */
		return;
	}

	if (direction < 0 && iolat->min_lat_nsec) {
		u64 samples_thresh;

		if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
			return;

		/*
		 * Sometimes high priority groups are their own worst enemy, so
		 * instead of taking it out on some poor other group that did 5%
		 * or less of the IO's for the last summation just skip this
		 * scale down event.
		 */
		samples_thresh = lat_info->nr_samples * 5;
		samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
		if (iolat->nr_samples <= samples_thresh)
			return;
	}

	/* We're as low as we can go. */
	if (iolat->max_depth == 1 && direction < 0) {
		blkcg_use_delay(lat_to_blkg(iolat));
		return;
	}

	/* We're back to the default cookie, unthrottle all the things. */
	if (cur_cookie == DEFAULT_SCALE_COOKIE) {
		blkcg_clear_delay(lat_to_blkg(iolat));
		iolat->max_depth = UINT_MAX;
		wake_up_all(&iolat->rq_wait.wait);
		return;
	}

	scale_change(iolat, direction > 0);
}

static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	struct blkcg_gq *blkg = bio->bi_blkg;
	bool issue_as_root = bio_issue_as_root_blkg(bio);

	if (!blkiolat->enabled)
		return;

	while (blkg && blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}

		check_scale_change(iolat);
		__blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
					   (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
		blkg = blkg->parent;
	}
	if (!timer_pending(&blkiolat->timer))
		mod_timer(&blkiolat->timer, jiffies + HZ);
}

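/*
 * Record the latency of a completed bio in the per-cpu window stats.  Bios
 * issued as root on behalf of a throttled group are not counted; instead any
 * headroom below the latency target is turned into delay to be paid back at
 * userspace return via blkcg_add_delay().
 */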
static void iolatency_record_time(struct iolatency_grp *iolat,
				  struct bio_issue *issue, u64 now,
				  bool issue_as_root)
{
	u64 start = bio_issue_time(issue);
	u64 req_time;

	/*
	 * Have to do this so we are truncated to the correct time that our
	 * issue is truncated to.
	 */
	now = __bio_issue_time(now);

	if (now <= start)
		return;

	req_time = now - start;

	/*
	 * We don't want to count issue_as_root bio's in the cgroups latency
	 * statistics as it could skew the numbers downwards.
	 */
	if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) {
		u64 sub = iolat->min_lat_nsec;
		if (req_time < sub)
			blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
		return;
	}

	latency_stat_record_time(iolat, req_time);
}

#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5

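/*
 * Called when this group's stat window has elapsed: fold the per-cpu stats
 * into the parent's child_latency_info and, if enough time has passed since
 * the last scale event, decide whether the peers' scale cookie should move
 * up or down.
 */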
static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
{
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	struct latency_stat stat;
	unsigned long flags;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
		latency_stat_init(iolat, s);
	}
	preempt_enable();

	parent = blkg_to_lat(blkg->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;

	iolat_update_total_lat_avg(iolat, &stat);

	/* Everything is ok and we don't need to adjust the scale. */
	if (latency_sum_ok(iolat, &stat) &&
	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
		return;

	/* Somebody beat us to the punch, just bail. */
	spin_lock_irqsave(&lat_info->lock, flags);

	latency_stat_sum(iolat, &iolat->cur_stat, &stat);
	lat_info->nr_samples -= iolat->nr_samples;
	lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
	iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);

	if ((lat_info->last_scale_event >= now ||
	     now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
		goto out;

	if (latency_sum_ok(iolat, &iolat->cur_stat) &&
	    latency_sum_ok(iolat, &stat)) {
		if (latency_stat_samples(iolat, &iolat->cur_stat) <
		    BLKIOLATENCY_MIN_GOOD_SAMPLES)
			goto out;
		if (lat_info->scale_grp == iolat) {
			lat_info->last_scale_event = now;
			scale_cookie_change(iolat->blkiolat, lat_info, true);
		}
	} else if (lat_info->scale_lat == 0 ||
		   lat_info->scale_lat >= iolat->min_lat_nsec) {
		lat_info->last_scale_event = now;
		if (!lat_info->scale_grp ||
		    lat_info->scale_lat > iolat->min_lat_nsec) {
			WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
			lat_info->scale_grp = iolat;
		}
		scale_cookie_change(iolat->blkiolat, lat_info, false);
	}
	latency_stat_init(iolat, &iolat->cur_stat);
out:
	spin_unlock_irqrestore(&lat_info->lock, flags);
}

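/*
 * Completion hook: walk every level that throttled this bio, drop the
 * inflight count and wake up waiters, record the latency, and roll the stat
 * window over once it has elapsed.
 */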
static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct rq_wait *rqw;
	struct iolatency_grp *iolat;
	u64 window_start;
	u64 now;
	bool issue_as_root = bio_issue_as_root_blkg(bio);
	int inflight = 0;

	blkg = bio->bi_blkg;
	if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
		return;

	iolat = blkg_to_lat(bio->bi_blkg);
	if (!iolat)
		return;

	if (!iolat->blkiolat->enabled)
		return;

	now = ktime_to_ns(ktime_get());
	while (blkg && blkg->parent) {
		iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}
		rqw = &iolat->rq_wait;

		inflight = atomic_dec_return(&rqw->inflight);
		WARN_ON_ONCE(inflight < 0);
		/*
		 * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
		 * submitted, so do not account for it.
		 */
		if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
			iolatency_record_time(iolat, &bio->bi_issue, now,
					      issue_as_root);
			window_start = atomic64_read(&iolat->window_start);
			if (now > window_start &&
			    (now - window_start) >= iolat->cur_win_nsec) {
				if (atomic64_try_cmpxchg(&iolat->window_start,
							 &window_start, now))
					iolatency_check_latencies(iolat, now);
			}
		}
		wake_up(&rqw->wait);
		blkg = blkg->parent;
	}
}

static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);

	timer_shutdown_sync(&blkiolat->timer);
	flush_work(&blkiolat->enable_work);
	blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iolatency);
	kfree(blkiolat);
}

static const struct rq_qos_ops blkcg_iolatency_ops = {
	.throttle = blkcg_iolatency_throttle,
	.done_bio = blkcg_iolatency_done_bio,
	.exit = blkcg_iolatency_exit,
};

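/*
 * Periodic timer, re-armed from the throttle path while IO is flowing.  For
 * parents that are still scaled down it either nudges the scale cookie back
 * up when no group owns the scale down, or clears a stale scale_grp once
 * five seconds have passed without a scale event.
 */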
static void blkiolatency_timer_fn(struct timer_list *t)
{
	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	u64 now = ktime_to_ns(ktime_get());

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css,
				     blkiolat->rqos.disk->queue->root_blkg) {
		struct iolatency_grp *iolat;
		struct child_latency_info *lat_info;
		unsigned long flags;
		u64 cookie;

		/*
		 * We could be exiting, don't access the pd unless we have a
		 * ref on the blkg.
		 */
		if (!blkg_tryget(blkg))
			continue;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		lat_info = &iolat->child_lat;
		cookie = atomic_read(&lat_info->scale_cookie);

		if (cookie >= DEFAULT_SCALE_COOKIE)
			goto next;

		spin_lock_irqsave(&lat_info->lock, flags);
		if (lat_info->last_scale_event >= now)
			goto next_lock;

		/*
		 * We scaled down but don't have a scale_grp, scale up and carry
		 * on.
		 */
		if (lat_info->scale_grp == NULL) {
			scale_cookie_change(iolat->blkiolat, lat_info, true);
			goto next_lock;
		}

		/*
		 * It's been 5 seconds since our last scale event, clear the
		 * scale grp in case the group that needed the scale down isn't
		 * doing any IO currently.
		 */
		if (now - lat_info->last_scale_event >=
		    ((u64)NSEC_PER_SEC * 5))
			lat_info->scale_grp = NULL;
next_lock:
		spin_unlock_irqrestore(&lat_info->lock, flags);
next:
		blkg_put(blkg);
	}
	rcu_read_unlock();
}

/**
 * blkiolatency_enable_work_fn - Enable or disable iolatency on the device
 * @work: enable_work of the blk_iolatency of interest
 *
 * iolatency needs to keep track of the number of in-flight IOs per cgroup. This
 * is relatively expensive as it involves walking up the hierarchy twice for
 * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we
 * want to disable the in-flight tracking.
 *
 * We have to make sure that the counting is balanced - we don't want to leak
 * the in-flight counts by disabling accounting in the completion path while IOs
 * are in flight. This is achieved by ensuring that no IO is in flight by
 * freezing the queue while flipping ->enabled. As this requires a sleepable
 * context, ->enabled flipping is punted to this work function.
 */
static void blkiolatency_enable_work_fn(struct work_struct *work)
{
	struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency,
						      enable_work);
	bool enabled;

	/*
	 * There can only be one instance of this function running for @blkiolat
	 * and it's guaranteed to be executed at least once after the latest
	 * ->enable_cnt modification. Acting on the latest ->enable_cnt is
	 * sufficient.
	 *
	 * Also, we know @blkiolat is safe to access as ->enable_work is flushed
	 * in blkcg_iolatency_exit().
	 */
	enabled = atomic_read(&blkiolat->enable_cnt);
	if (enabled != blkiolat->enabled) {
		blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
		blkiolat->enabled = enabled;
		blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue);
	}
}

static int blk_iolatency_init(struct gendisk *disk)
{
	struct blk_iolatency *blkiolat;
	int ret;

	blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
	if (!blkiolat)
		return -ENOMEM;

	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
			 &blkcg_iolatency_ops);
	if (ret)
		goto err_free;
	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
	if (ret)
		goto err_qos_del;

	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
	INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);

	return 0;

err_qos_del:
	rq_qos_del(&blkiolat->rqos);
err_free:
	kfree(blkiolat);
	return ret;
}

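/*
 * Update the group's latency target and derive its sampling window as 16x
 * the target, clamped to [BLKIOLATENCY_MIN_WIN_SIZE, BLKIOLATENCY_MAX_WIN_SIZE].
 * Setting the first non-zero target on a device schedules enable_work to turn
 * on inflight tracking; clearing the last one schedules it to turn tracking
 * back off.
 */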
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
	struct iolatency_grp *iolat = blkg_to_lat(blkg);
	struct blk_iolatency *blkiolat = iolat->blkiolat;
	u64 oldval = iolat->min_lat_nsec;

	iolat->min_lat_nsec = val;
	iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
	iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
				    BLKIOLATENCY_MAX_WIN_SIZE);

	if (!oldval && val) {
		if (atomic_inc_return(&blkiolat->enable_cnt) == 1)
			schedule_work(&blkiolat->enable_work);
	}
	if (oldval && !val) {
		blkcg_clear_delay(blkg);
		if (atomic_dec_return(&blkiolat->enable_cnt) == 0)
			schedule_work(&blkiolat->enable_work);
	}
}

static void iolatency_clear_scaling(struct blkcg_gq *blkg)
{
	if (blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
		struct child_latency_info *lat_info;
		if (!iolat)
			return;

		lat_info = &iolat->child_lat;
		spin_lock(&lat_info->lock);
		atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
		lat_info->last_scale_event = 0;
		lat_info->scale_grp = NULL;
		lat_info->scale_lat = 0;
		spin_unlock(&lat_info->lock);
	}
}

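/*
 * Write handler for the cgroup "io.latency" file.  The body has the form
 * "MAJ:MIN target=<usecs>" ("target=max" clears the target); the first
 * configuration on a device also instantiates the rq-qos policy via
 * blk_iolatency_init().
 */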
static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
				   size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkcg_gq *blkg;
	struct blkg_conf_ctx ctx;
	struct iolatency_grp *iolat;
	char *p, *tok;
	u64 lat_val = 0;
	u64 oldval;
	int ret;

	blkg_conf_init(&ctx, buf);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto out;

	/*
	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
	 * confuse iolat_rq_qos() test. Make the test and init atomic.
	 */
	lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex);
	if (!iolat_rq_qos(ctx.bdev->bd_queue))
		ret = blk_iolatency_init(ctx.bdev->bd_disk);
	if (ret)
		goto out;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx);
	if (ret)
		goto out;

	iolat = blkg_to_lat(ctx.blkg);
	p = ctx.body;

	ret = -EINVAL;
	while ((tok = strsep(&p, " "))) {
		char key[16];
		char val[21];	/* 18446744073709551616 */

		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
			goto out;

		if (!strcmp(key, "target")) {
			u64 v;

			if (!strcmp(val, "max"))
				lat_val = 0;
			else if (sscanf(val, "%llu", &v) == 1)
				lat_val = v * NSEC_PER_USEC;
			else
				goto out;
		} else {
			goto out;
		}
	}

	/* Walk up the tree to see if our new val is lower than it should be. */
	blkg = ctx.blkg;
	oldval = iolat->min_lat_nsec;

	iolatency_set_min_lat_nsec(blkg, lat_val);
	if (oldval != iolat->min_lat_nsec)
		iolatency_clear_scaling(blkg);
	ret = 0;
out:
	blkg_conf_exit(&ctx);
	return ret ?: nbytes;
}

static u64 iolatency_prfill_limit(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname || !iolat->min_lat_nsec)
		return 0;
	seq_printf(sf, "%s target=%llu\n",
		   dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
	return 0;
}

static int iolatency_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  iolatency_prfill_limit,
			  &blkcg_policy_iolatency, seq_cft(sf)->private, false);
	return 0;
}

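/*
 * Debug stat output for ssd devices: missed vs total samples accumulated
 * since the last window rollover, plus the queue depth we are currently
 * throttled to.
 */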
static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
{
	struct latency_stat stat;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
	}
	preempt_enable();

	if (iolat->max_depth == UINT_MAX)
		seq_printf(s, " missed=%llu total=%llu depth=max",
			   (unsigned long long)stat.ps.missed,
			   (unsigned long long)stat.ps.total);
	else
		seq_printf(s, " missed=%llu total=%llu depth=%u",
			   (unsigned long long)stat.ps.missed,
			   (unsigned long long)stat.ps.total,
			   iolat->max_depth);
}

static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	unsigned long long avg_lat;
	unsigned long long cur_win;

	if (!blkcg_debug_stats)
		return;

	if (iolat->ssd)
		return iolatency_ssd_stat(iolat, s);

	avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
	cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
	if (iolat->max_depth == UINT_MAX)
		seq_printf(s, " depth=max avg_lat=%llu win=%llu",
			   avg_lat, cur_win);
	else
		seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
			   iolat->max_depth, avg_lat, cur_win);
}

static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk,
						   struct blkcg *blkcg, gfp_t gfp)
{
	struct iolatency_grp *iolat;

	iolat = kzalloc_node(sizeof(*iolat), gfp, disk->node_id);
	if (!iolat)
		return NULL;
	iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
					  __alignof__(struct latency_stat), gfp);
	if (!iolat->stats) {
		kfree(iolat);
		return NULL;
	}
	return &iolat->pd;
}

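/*
 * Per-blkg init: pick ssd vs rotational stats mode, reset the per-cpu
 * windows, start with an unlimited queue depth, and inherit the parent's
 * current scale cookie if the parent's pd has already been initialized.
 */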
static void iolatency_pd_init(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct rq_qos *rqos = iolat_rq_qos(blkg->q);
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	u64 now = ktime_to_ns(ktime_get());
	int cpu;

	if (blk_queue_nonrot(blkg->q))
		iolat->ssd = true;
	else
		iolat->ssd = false;

	for_each_possible_cpu(cpu) {
		struct latency_stat *stat;
		stat = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_init(iolat, stat);
	}

	latency_stat_init(iolat, &iolat->cur_stat);
	rq_wait_init(&iolat->rq_wait);
	spin_lock_init(&iolat->child_lat.lock);
	iolat->max_depth = UINT_MAX;
	iolat->blkiolat = blkiolat;
	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
	atomic64_set(&iolat->window_start, now);

	/*
	 * We init things in list order, so the pd for the parent may not be
	 * init'ed yet for whatever reason.
	 */
	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
		atomic_set(&iolat->scale_cookie,
			   atomic_read(&parent->child_lat.scale_cookie));
	} else {
		atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
	}

	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
}

static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);

	iolatency_set_min_lat_nsec(blkg, 0);
	iolatency_clear_scaling(blkg);
}

static void iolatency_pd_free(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	free_percpu(iolat->stats);
	kfree(iolat);
}

static struct cftype iolatency_files[] = {
	{
		.name = "latency",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = iolatency_print_limit,
		.write = iolatency_set_limit,
	},
	{}
};

static struct blkcg_policy blkcg_policy_iolatency = {
	.dfl_cftypes	= iolatency_files,
	.pd_alloc_fn	= iolatency_pd_alloc,
	.pd_init_fn	= iolatency_pd_init,
	.pd_offline_fn	= iolatency_pd_offline,
	.pd_free_fn	= iolatency_pd_free,
	.pd_stat_fn	= iolatency_pd_stat,
};

static int __init iolatency_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iolatency);
}

static void __exit iolatency_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_iolatency);
}

module_init(iolatency_init);
module_exit(iolatency_exit);