// SPDX-License-Identifier: GPL-2.0
/*
 * Block rq-qos base io controller
 *
 * This works similarly to wbt, with a few exceptions:
 *
 * - It's bio based, so the latency covers the whole block layer in addition to
 *   the actual io.
 * - We will throttle all IO that comes in here if we need to.
 * - We use the mean latency over the 100ms window.  This is because writes can
 *   be particularly fast, which could give us a false sense of the impact of
 *   other workloads on our protected workload.
 * - By default there's no throttling, we set the queue_depth to UINT_MAX so
 *   that we can have as many outstanding bio's as we're allowed to.  Only at
 *   throttle time do we pay attention to the actual queue depth.
 *
 * The hierarchy works like the cpu controller does, we track the latency at
 * every configured node, and each configured node has its own independent
 * queue depth.  This means that we only care about our latency targets at the
 * peer level.  Some group at the bottom of the hierarchy isn't going to affect
 * a group at the end of some other path if we're only configured at leaf level.
 *
 * Consider the following
 *
 *                   root blkg
 *             /                     \
 *        fast (target=5ms)     slow (target=10ms)
 *         /     \                  /        \
 *       a        b          normal(15ms)   unloved
 *
 * "a" and "b" have no target, but their combined io under "fast" cannot exceed
 * an average latency of 5ms.  If it does then we will throttle the "slow"
 * group.  In the case of "normal", if it exceeds its 15ms target, we will
 * throttle "unloved", but nobody else.
 *
 * In this example "fast", "slow", and "normal" will be the only groups actually
 * accounting their io latencies.  We have to walk up the hierarchy to the root
 * on every submit and complete so we can do the appropriate stat recording and
 * adjust the queue depth of ourselves if needed.
 *
 * There are 2 ways we throttle IO.
 *
 * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
 * number of IO's we're allowed to have in flight.  This starts at UINT_MAX down
 * to 1.  If the group is only ever submitting IO for itself then this is the
 * only way we throttle.
 *
 * 2) Induced delay throttling.  This is for the case that a group is generating
 * IO that has to be issued by the root cg to avoid priority inversion. So think
 * REQ_META or REQ_SWAP.  If we are already at qd == 1 and we're getting a lot
 * of work done for us on behalf of the root cg and are being asked to scale
 * down more then we induce a latency at userspace return.  We accumulate the
 * total amount of time we need to be punished by doing
 *
 * total_time += min_lat_nsec - actual_io_completion
 *
 * and then at throttle time will do
 *
 * throttle_time = min(total_time, NSEC_PER_SEC)
 *
 * This induced delay will throttle back the activity that is generating the
 * root cg issued io's, whether that's some metadata intensive operation or the
 * group is using so much memory that it is pushing us into swap.
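 *
 * For example (illustrative numbers only): with min_lat_nsec set to 5ms, a
 * root cg issued io that completes in 2ms adds 3ms to total_time, and the
 * delay charged back at userspace return is capped at one second per the
 * min() above.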
 *
 * Copyright (C) 2018 Josef Bacik
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/memcontrol.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-cgroup.h"
#include "blk.h"

#define DEFAULT_SCALE_COOKIE 1000000U

static struct blkcg_policy blkcg_policy_iolatency;
struct iolatency_grp;

struct blk_iolatency {
	struct rq_qos rqos;
	struct timer_list timer;

	/*
	 * ->enabled is the master enable switch gating the throttling logic and
	 * inflight tracking. The number of cgroups which have iolat enabled is
	 * tracked in ->enable_cnt, and ->enabled is flipped on/off accordingly
	 * from ->enable_work with the request_queue frozen. For details, see
	 * blkiolatency_enable_work_fn().
	 */
	bool enabled;
	atomic_t enable_cnt;
	struct work_struct enable_work;
};

static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
{
	return container_of(rqos, struct blk_iolatency, rqos);
}

struct child_latency_info {
	spinlock_t lock;

	/* Last time we adjusted the scale of everybody. */
	u64 last_scale_event;

	/* The latency that we missed. */
	u64 scale_lat;

	/* Total io's from all of our children for the last summation. */
	u64 nr_samples;

	/* The guy who actually changed the latency numbers. */
	struct iolatency_grp *scale_grp;

	/* Cookie to tell if we need to scale up or down. */
	atomic_t scale_cookie;
};

struct percentile_stats {
	u64 total;
	u64 missed;
};

struct latency_stat {
	union {
		struct percentile_stats ps;
		struct blk_rq_stat rqs;
	};
};

struct iolatency_grp {
	struct blkg_policy_data pd;
	struct latency_stat __percpu *stats;
	struct latency_stat cur_stat;
	struct blk_iolatency *blkiolat;
	unsigned int max_depth;
	struct rq_wait rq_wait;
	atomic64_t window_start;
	atomic_t scale_cookie;
	u64 min_lat_nsec;
	u64 cur_win_nsec;

	/* total running average of our io latency. */
	u64 lat_avg;

	/* Our current number of IO's for the last summation. */
	u64 nr_samples;

	bool ssd;
	struct child_latency_info child_lat;
};

#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
 * These are the constants used to fake the fixed-point moving average
 * calculation just like load average.  The call to calc_load() folds
 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
 * window size is bucketed to try to approximately calculate average
 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 * elapse immediately.  Note, windows only elapse with IO activity.  Idle
 * periods extend the most recent window.
 */
#define BLKIOLATENCY_NR_EXP_FACTORS 5
#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
				      (BLKIOLATENCY_NR_EXP_FACTORS - 1))
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
	2045, // exp(1/600) - 600 samples
	2039, // exp(1/240) - 240 samples
	2031, // exp(1/120) - 120 samples
	2023, // exp(1/80)  - 80 samples
	2014, // exp(1/60)  - 60 samples
};

static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
}

static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
{
	return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
}

static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
{
	return pd_to_blkg(&iolat->pd);
}

static inline void latency_stat_init(struct iolatency_grp *iolat,
				     struct latency_stat *stat)
{
	if (iolat->ssd) {
		stat->ps.total = 0;
		stat->ps.missed = 0;
	} else
		blk_rq_stat_init(&stat->rqs);
}

static inline void latency_stat_sum(struct iolatency_grp *iolat,
				    struct latency_stat *sum,
				    struct latency_stat *stat)
{
	if (iolat->ssd) {
		sum->ps.total += stat->ps.total;
		sum->ps.missed += stat->ps.missed;
	} else
		blk_rq_stat_sum(&sum->rqs, &stat->rqs);
}

static inline void latency_stat_record_time(struct iolatency_grp *iolat,
					    u64 req_time)
{
	struct latency_stat *stat = get_cpu_ptr(iolat->stats);
	if (iolat->ssd) {
		if (req_time >= iolat->min_lat_nsec)
			stat->ps.missed++;
		stat->ps.total++;
	} else
		blk_rq_stat_add(&stat->rqs, req_time);
	put_cpu_ptr(stat);
}

static inline bool latency_sum_ok(struct iolatency_grp *iolat,
				  struct latency_stat *stat)
{
	if (iolat->ssd) {
		u64 thresh = div64_u64(stat->ps.total, 10);
		thresh = max(thresh, 1ULL);
		return stat->ps.missed < thresh;
	}
	return stat->rqs.mean <= iolat->min_lat_nsec;
}

static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
				       struct latency_stat *stat)
{
	if (iolat->ssd)
		return stat->ps.total;
	return stat->rqs.nr_samples;
}

static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
					      struct latency_stat *stat)
{
	int exp_idx;

	if (iolat->ssd)
		return;

	/*
	 * calc_load() takes in a number stored in fixed point representation.
	 * Because we are using this for IO time in ns, the values stored
	 * are significantly larger than the FIXED_1 denominator (2048).
	 * Therefore, rounding errors in the calculation are negligible and
	 * can be ignored.
	 */
	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
			div64_u64(iolat->cur_win_nsec,
				  BLKIOLATENCY_EXP_BUCKET_SIZE));
	iolat->lat_avg = calc_load(iolat->lat_avg,
				   iolatency_exp_factors[exp_idx],
				   stat->rqs.mean);
}

static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	atomic_dec(&rqw->inflight);
	wake_up(&rqw->wait);
}

static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
	struct iolatency_grp *iolat = private_data;
	return rq_wait_inc_below(rqw, iolat->max_depth);
}

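/*
 * Take an inflight slot at this level of the hierarchy, sleeping on the
 * rq_wait if the group is at its current max_depth.  If the cgroup has
 * accumulated delay, also schedule the issuing task to be throttled on
 * return to userspace.  Bios issued as the root blkg and tasks with a fatal
 * signal pending skip the wait: the former to avoid priority inversions, the
 * latter so a dying task isn't held up by throttling.
 */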
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
				       struct iolatency_grp *iolat,
				       bool issue_as_root,
				       bool use_memdelay)
{
	struct rq_wait *rqw = &iolat->rq_wait;
	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);

	if (use_delay)
		blkcg_schedule_throttle(rqos->disk, use_memdelay);

	/*
	 * To avoid priority inversions we want to just take a slot if we are
	 * issuing as root.  If we're being killed off there's no point in
	 * delaying things, we may have been killed by OOM so throttling may
	 * make recovery take even longer, so just let the IO's through so the
	 * task can go away.
	 */
	if (issue_as_root || fatal_signal_pending(current)) {
		atomic_inc(&rqw->inflight);
		return;
	}

	rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
}

#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR 4

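/*
 * Step size for a scale event: 1/16th of the queue depth when scaling up,
 * 1/4th when scaling down, and never less than 1.
 */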
static inline unsigned long scale_amount(unsigned long qd, bool up)
{
	return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
}

/*
 * We scale the qd down faster than we scale up, so we need to use this helper
 * to adjust the scale_cookie accordingly so we don't prematurely get
 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 *
 * Each group has its own local copy of the last scale cookie it saw, so if
 * the global scale cookie goes up or down they know which way they need to go
 * based on their last knowledge of it.
 */
static void scale_cookie_change(struct blk_iolatency *blkiolat,
				struct child_latency_info *lat_info,
				bool up)
{
	unsigned long qd = blkiolat->rqos.disk->queue->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = atomic_read(&lat_info->scale_cookie);
	unsigned long max_scale = qd << 1;
	unsigned long diff = 0;

	if (old < DEFAULT_SCALE_COOKIE)
		diff = DEFAULT_SCALE_COOKIE - old;

	if (up) {
		if (scale + old > DEFAULT_SCALE_COOKIE)
			atomic_set(&lat_info->scale_cookie,
				   DEFAULT_SCALE_COOKIE);
		else if (diff > qd)
			atomic_inc(&lat_info->scale_cookie);
		else
			atomic_add(scale, &lat_info->scale_cookie);
	} else {
		/*
		 * We don't want to dig a hole so deep that it takes us hours to
		 * dig out of it.  Just enough that we don't throttle/unthrottle
		 * with jagged workloads but can still unthrottle once pressure
		 * has sufficiently dissipated.
		 */
		if (diff > qd) {
			if (diff < max_scale)
				atomic_dec(&lat_info->scale_cookie);
		} else {
			atomic_sub(scale, &lat_info->scale_cookie);
		}
	}
}

/*
 * Change the queue depth of the iolatency_grp.  We add 1/16th of the
 * queue depth at a time so we don't get wild swings and hopefully dial in to
 * fairer distribution of the overall queue depth.  We halve the queue depth
 * at a time so we can scale down queue depth quickly from default unlimited
 * to target.
 */
static void scale_change(struct iolatency_grp *iolat, bool up)
{
	unsigned long qd = iolat->blkiolat->rqos.disk->queue->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = iolat->max_depth;

	if (old > qd)
		old = qd;

	if (up) {
		if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
			return;

		if (old < qd) {
			old += scale;
			old = min(old, qd);
			iolat->max_depth = old;
			wake_up_all(&iolat->rq_wait.wait);
		}
	} else {
		old >>= 1;
		iolat->max_depth = max(old, 1UL);
	}
}

/* Check our parent and see if the scale cookie has changed. */
static void check_scale_change(struct iolatency_grp *iolat)
{
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	unsigned int cur_cookie;
	unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
	u64 scale_lat;
	int direction = 0;

	parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;
	cur_cookie = atomic_read(&lat_info->scale_cookie);
	scale_lat = READ_ONCE(lat_info->scale_lat);

	if (cur_cookie < our_cookie)
		direction = -1;
	else if (cur_cookie > our_cookie)
		direction = 1;
	else
		return;

	if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) {
		/* Somebody beat us to the punch, just bail. */
		return;
	}

	if (direction < 0 && iolat->min_lat_nsec) {
		u64 samples_thresh;

		if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
			return;

		/*
		 * Sometimes high priority groups are their own worst enemy, so
		 * instead of taking it out on some poor other group that did 5%
		 * or less of the IO's for the last summation just skip this
		 * scale down event.
		 */
		samples_thresh = lat_info->nr_samples * 5;
		samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
		if (iolat->nr_samples <= samples_thresh)
			return;
	}

	/* We're as low as we can go. */
	if (iolat->max_depth == 1 && direction < 0) {
		blkcg_use_delay(lat_to_blkg(iolat));
		return;
	}

	/* We're back to the default cookie, unthrottle all the things. */
	if (cur_cookie == DEFAULT_SCALE_COOKIE) {
		blkcg_clear_delay(lat_to_blkg(iolat));
		iolat->max_depth = UINT_MAX;
		wake_up_all(&iolat->rq_wait.wait);
		return;
	}

	scale_change(iolat, direction > 0);
}

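/*
 * Submission hook: walk from the bio's blkg up to the root, applying any
 * pending scale cookie change and taking (or waiting for) an inflight slot at
 * every level, then arm the unthrottle timer if it isn't already pending.
 */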
static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	struct blkcg_gq *blkg = bio->bi_blkg;
	bool issue_as_root = bio_issue_as_root_blkg(bio);

	if (!blkiolat->enabled)
		return;

	while (blkg && blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}

		check_scale_change(iolat);
		__blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
				     (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
		blkg = blkg->parent;
	}
	if (!timer_pending(&blkiolat->timer))
		mod_timer(&blkiolat->timer, jiffies + HZ);
}

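/*
 * Record the completion latency of a bio against @iolat's per-cpu stats.
 * Bios issued as the root blkg are not accounted while the group is being
 * throttled; instead, if they completed faster than the group's target, the
 * difference is charged to the group as induced delay.
 */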
static void iolatency_record_time(struct iolatency_grp *iolat,
				  struct bio_issue *issue, u64 now,
				  bool issue_as_root)
{
	u64 start = bio_issue_time(issue);
	u64 req_time;

	/*
	 * Truncate @now the same way the bio issue time was truncated so the
	 * two timestamps are directly comparable.
	 */
	now = __bio_issue_time(now);

	if (now <= start)
		return;

	req_time = now - start;

	/*
	 * We don't want to count issue_as_root bio's in the cgroup's latency
	 * statistics as it could skew the numbers downwards.
	 */
	if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) {
		u64 sub = iolat->min_lat_nsec;
		if (req_time < sub)
			blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
		return;
	}

	latency_stat_record_time(iolat, req_time);
}

#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5

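/*
 * Called when a latency window closes: fold the per-cpu stats into a single
 * sample, update the running average, and based on whether we met or missed
 * our target ask the parent to scale its children back up or further down via
 * the scale cookie.
 */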
static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
{
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	struct latency_stat stat;
	unsigned long flags;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
		latency_stat_init(iolat, s);
	}
	preempt_enable();

	parent = blkg_to_lat(blkg->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;

	iolat_update_total_lat_avg(iolat, &stat);

	/* Everything is ok and we don't need to adjust the scale. */
	if (latency_sum_ok(iolat, &stat) &&
	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
		return;

	/* Somebody beat us to the punch, just bail. */
	spin_lock_irqsave(&lat_info->lock, flags);

	latency_stat_sum(iolat, &iolat->cur_stat, &stat);
	lat_info->nr_samples -= iolat->nr_samples;
	lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
	iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);

	if ((lat_info->last_scale_event >= now ||
	    now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
		goto out;

	if (latency_sum_ok(iolat, &iolat->cur_stat) &&
	    latency_sum_ok(iolat, &stat)) {
		if (latency_stat_samples(iolat, &iolat->cur_stat) <
		    BLKIOLATENCY_MIN_GOOD_SAMPLES)
			goto out;
		if (lat_info->scale_grp == iolat) {
			lat_info->last_scale_event = now;
			scale_cookie_change(iolat->blkiolat, lat_info, true);
		}
	} else if (lat_info->scale_lat == 0 ||
		   lat_info->scale_lat >= iolat->min_lat_nsec) {
		lat_info->last_scale_event = now;
		if (!lat_info->scale_grp ||
		    lat_info->scale_lat > iolat->min_lat_nsec) {
			WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
			lat_info->scale_grp = iolat;
		}
		scale_cookie_change(iolat->blkiolat, lat_info, false);
	}
	latency_stat_init(iolat, &iolat->cur_stat);
out:
	spin_unlock_irqrestore(&lat_info->lock, flags);
}

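/*
 * Completion hook: walk from the bio's blkg up to the root, dropping the
 * inflight count and waking waiters at each level, recording the completion
 * latency, and closing out the current latency window once it has elapsed.
 */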
static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct rq_wait *rqw;
	struct iolatency_grp *iolat;
	u64 window_start;
	u64 now;
	bool issue_as_root = bio_issue_as_root_blkg(bio);
	int inflight = 0;

	blkg = bio->bi_blkg;
	if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
		return;

	iolat = blkg_to_lat(bio->bi_blkg);
	if (!iolat)
		return;

	if (!iolat->blkiolat->enabled)
		return;

	now = ktime_to_ns(ktime_get());
	while (blkg && blkg->parent) {
		iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}
		rqw = &iolat->rq_wait;

		inflight = atomic_dec_return(&rqw->inflight);
		WARN_ON_ONCE(inflight < 0);
		/*
		 * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
		 * submitted, so do not account for it.
		 */
		if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
			iolatency_record_time(iolat, &bio->bi_issue, now,
					      issue_as_root);
			window_start = atomic64_read(&iolat->window_start);
			if (now > window_start &&
			    (now - window_start) >= iolat->cur_win_nsec) {
				if (atomic64_try_cmpxchg(&iolat->window_start,
							 &window_start, now))
					iolatency_check_latencies(iolat, now);
			}
		}
		wake_up(&rqw->wait);
		blkg = blkg->parent;
	}
}

static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);

	timer_shutdown_sync(&blkiolat->timer);
	flush_work(&blkiolat->enable_work);
	blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iolatency);
	kfree(blkiolat);
}

static const struct rq_qos_ops blkcg_iolatency_ops = {
	.throttle = blkcg_iolatency_throttle,
	.done_bio = blkcg_iolatency_done_bio,
	.exit = blkcg_iolatency_exit,
};

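/*
 * Fires one second after the last throttled submission.  For any parent whose
 * children are currently scaled down, scale back up if no group owns the
 * scale down anymore, and clear a stale scale_grp once five seconds have
 * passed without a scale event so an idle offender can't pin everyone down.
 */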
static void blkiolatency_timer_fn(struct timer_list *t)
{
	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	u64 now = ktime_to_ns(ktime_get());

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css,
				     blkiolat->rqos.disk->queue->root_blkg) {
		struct iolatency_grp *iolat;
		struct child_latency_info *lat_info;
		unsigned long flags;
		u64 cookie;

		/*
		 * We could be exiting, don't access the pd unless we have a
		 * ref on the blkg.
		 */
		if (!blkg_tryget(blkg))
			continue;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		lat_info = &iolat->child_lat;
		cookie = atomic_read(&lat_info->scale_cookie);

		if (cookie >= DEFAULT_SCALE_COOKIE)
			goto next;

		spin_lock_irqsave(&lat_info->lock, flags);
		if (lat_info->last_scale_event >= now)
			goto next_lock;

		/*
		 * We scaled down but don't have a scale_grp, scale up and carry
		 * on.
		 */
		if (lat_info->scale_grp == NULL) {
			scale_cookie_change(iolat->blkiolat, lat_info, true);
			goto next_lock;
		}

		/*
		 * It's been 5 seconds since our last scale event, clear the
		 * scale grp in case the group that needed the scale down isn't
		 * doing any IO currently.
		 */
		if (now - lat_info->last_scale_event >=
		    ((u64)NSEC_PER_SEC * 5))
			lat_info->scale_grp = NULL;
next_lock:
		spin_unlock_irqrestore(&lat_info->lock, flags);
next:
		blkg_put(blkg);
	}
	rcu_read_unlock();
}

/**
 * blkiolatency_enable_work_fn - Enable or disable iolatency on the device
 * @work: enable_work of the blk_iolatency of interest
 *
 * iolatency needs to keep track of the number of in-flight IOs per cgroup. This
 * is relatively expensive as it involves walking up the hierarchy twice for
 * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we
 * want to disable the in-flight tracking.
 *
 * We have to make sure that the counting is balanced - we don't want to leak
 * the in-flight counts by disabling accounting in the completion path while IOs
 * are in flight. This is achieved by ensuring that no IO is in flight by
 * freezing the queue while flipping ->enabled. As this requires a sleepable
 * context, ->enabled flipping is punted to this work function.
 */
static void blkiolatency_enable_work_fn(struct work_struct *work)
{
	struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency,
						      enable_work);
	bool enabled;

	/*
	 * There can only be one instance of this function running for @blkiolat
	 * and it's guaranteed to be executed at least once after the latest
	 * ->enable_cnt modification. Acting on the latest ->enable_cnt is
	 * sufficient.
	 *
	 * Also, we know @blkiolat is safe to access as ->enable_work is flushed
	 * in blkcg_iolatency_exit().
	 */
	enabled = atomic_read(&blkiolat->enable_cnt);
	if (enabled != blkiolat->enabled) {
		blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
		blkiolat->enabled = enabled;
		blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue);
	}
}

static int blk_iolatency_init(struct gendisk *disk)
{
	struct blk_iolatency *blkiolat;
	int ret;

	blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
	if (!blkiolat)
		return -ENOMEM;

	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
			 &blkcg_iolatency_ops);
	if (ret)
		goto err_free;
	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
	if (ret)
		goto err_qos_del;

	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
	INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);

	return 0;

err_qos_del:
	rq_qos_del(&blkiolat->rqos);
err_free:
	kfree(blkiolat);
	return ret;
}

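/*
 * Update a group's latency target and derive its sampling window from it
 * (16x the target, clamped to [100ms, 1s]).  Setting the first non-zero
 * target on the device, or clearing the last one, kicks enable_work to flip
 * the global ->enabled switch.
 */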
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
	struct iolatency_grp *iolat = blkg_to_lat(blkg);
	struct blk_iolatency *blkiolat = iolat->blkiolat;
	u64 oldval = iolat->min_lat_nsec;

	iolat->min_lat_nsec = val;
	iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
	iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
				    BLKIOLATENCY_MAX_WIN_SIZE);

	if (!oldval && val) {
		if (atomic_inc_return(&blkiolat->enable_cnt) == 1)
			schedule_work(&blkiolat->enable_work);
	}
	if (oldval && !val) {
		blkcg_clear_delay(blkg);
		if (atomic_dec_return(&blkiolat->enable_cnt) == 0)
			schedule_work(&blkiolat->enable_work);
	}
}

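/*
 * Reset the parent's scaling state so a changed target takes effect from a
 * clean slate rather than inheriting a stale scale_lat or scale_grp.
 */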
static void iolatency_clear_scaling(struct blkcg_gq *blkg)
{
	if (blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
		struct child_latency_info *lat_info;
		if (!iolat)
			return;

		lat_info = &iolat->child_lat;
		spin_lock(&lat_info->lock);
		atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
		lat_info->last_scale_event = 0;
		lat_info->scale_grp = NULL;
		lat_info->scale_lat = 0;
		spin_unlock(&lat_info->lock);
	}
}

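/*
 * Write handler for the cgroup "latency" file.  Parses "MAJ:MIN target=<usec>"
 * (or "target=max" to clear the target), lazily sets up the rq_qos instance
 * for the device on first use, and applies the new target to the group.
 */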
static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkcg_gq *blkg;
	struct blkg_conf_ctx ctx;
	struct iolatency_grp *iolat;
	char *p, *tok;
	u64 lat_val = 0;
	u64 oldval;
	int ret;

	blkg_conf_init(&ctx, buf);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto out;

	/*
	 * blk_iolatency_init() may fail after rq_qos_add() succeeds, which can
	 * confuse the iolat_rq_qos() test. Make the test and init atomic.
	 */
	lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex);
	if (!iolat_rq_qos(ctx.bdev->bd_queue))
		ret = blk_iolatency_init(ctx.bdev->bd_disk);
	if (ret)
		goto out;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx);
	if (ret)
		goto out;

	iolat = blkg_to_lat(ctx.blkg);
	p = ctx.body;

	ret = -EINVAL;
	while ((tok = strsep(&p, " "))) {
		char key[16];
		char val[21];	/* 18446744073709551615 */

		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
			goto out;

		if (!strcmp(key, "target")) {
			u64 v;

			if (!strcmp(val, "max"))
				lat_val = 0;
			else if (sscanf(val, "%llu", &v) == 1)
				lat_val = v * NSEC_PER_USEC;
			else
				goto out;
		} else {
			goto out;
		}
	}

	/* Walk up the tree to see if our new val is lower than it should be. */
	blkg = ctx.blkg;
	oldval = iolat->min_lat_nsec;

	iolatency_set_min_lat_nsec(blkg, lat_val);
	if (oldval != iolat->min_lat_nsec)
		iolatency_clear_scaling(blkg);
	ret = 0;
out:
	blkg_conf_exit(&ctx);
	return ret ?: nbytes;
}

static u64 iolatency_prfill_limit(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname || !iolat->min_lat_nsec)
		return 0;
	seq_printf(sf, "%s target=%llu\n",
		   dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
	return 0;
}

static int iolatency_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  iolatency_prfill_limit,
			  &blkcg_policy_iolatency, seq_cft(sf)->private, false);
	return 0;
}

static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
{
	struct latency_stat stat;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
	}
	preempt_enable();

	if (iolat->max_depth == UINT_MAX)
		seq_printf(s, " missed=%llu total=%llu depth=max",
			(unsigned long long)stat.ps.missed,
			(unsigned long long)stat.ps.total);
	else
		seq_printf(s, " missed=%llu total=%llu depth=%u",
			(unsigned long long)stat.ps.missed,
			(unsigned long long)stat.ps.total,
			iolat->max_depth);
}

static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	unsigned long long avg_lat;
	unsigned long long cur_win;

	if (!blkcg_debug_stats)
		return;

	if (iolat->ssd)
		return iolatency_ssd_stat(iolat, s);

	avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
	cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
	if (iolat->max_depth == UINT_MAX)
		seq_printf(s, " depth=max avg_lat=%llu win=%llu",
			avg_lat, cur_win);
	else
		seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
			iolat->max_depth, avg_lat, cur_win);
}

static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk,
		struct blkcg *blkcg, gfp_t gfp)
{
	struct iolatency_grp *iolat;

	iolat = kzalloc_node(sizeof(*iolat), gfp, disk->node_id);
	if (!iolat)
		return NULL;
	iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
				       __alignof__(struct latency_stat), gfp);
	if (!iolat->stats) {
		kfree(iolat);
		return NULL;
	}
	return &iolat->pd;
}

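/*
 * Set up a newly attached group: pick SSD vs. rotational accounting, start
 * with an unlimited queue depth and a 100ms window, and inherit the parent's
 * current scale cookie so a new group joins its peers at the same scaling
 * level.
 */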
static void iolatency_pd_init(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct rq_qos *rqos = iolat_rq_qos(blkg->q);
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	u64 now = ktime_to_ns(ktime_get());
	int cpu;

	if (blk_queue_nonrot(blkg->q))
		iolat->ssd = true;
	else
		iolat->ssd = false;

	for_each_possible_cpu(cpu) {
		struct latency_stat *stat;
		stat = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_init(iolat, stat);
	}

	latency_stat_init(iolat, &iolat->cur_stat);
	rq_wait_init(&iolat->rq_wait);
	spin_lock_init(&iolat->child_lat.lock);
	iolat->max_depth = UINT_MAX;
	iolat->blkiolat = blkiolat;
	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
	atomic64_set(&iolat->window_start, now);

	/*
	 * We init things in list order, so the pd for the parent may not be
	 * init'ed yet for whatever reason.
	 */
	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
		atomic_set(&iolat->scale_cookie,
			   atomic_read(&parent->child_lat.scale_cookie));
	} else {
		atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
	}

	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
}

static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);

	iolatency_set_min_lat_nsec(blkg, 0);
	iolatency_clear_scaling(blkg);
}

static void iolatency_pd_free(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	free_percpu(iolat->stats);
	kfree(iolat);
}

static struct cftype iolatency_files[] = {
	{
		.name = "latency",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = iolatency_print_limit,
		.write = iolatency_set_limit,
	},
	{}
};

static struct blkcg_policy blkcg_policy_iolatency = {
	.dfl_cftypes	= iolatency_files,
	.pd_alloc_fn	= iolatency_pd_alloc,
	.pd_init_fn	= iolatency_pd_init,
	.pd_offline_fn	= iolatency_pd_offline,
	.pd_free_fn	= iolatency_pd_free,
	.pd_stat_fn	= iolatency_pd_stat,
};

static int __init iolatency_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iolatency);
}

static void __exit iolatency_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_iolatency);
}

module_init(iolatency_init);
module_exit(iolatency_exit);