162306a36Sopenharmony_ci#ifndef BLK_THROTTLE_H 262306a36Sopenharmony_ci#define BLK_THROTTLE_H 362306a36Sopenharmony_ci 462306a36Sopenharmony_ci#include "blk-cgroup-rwstat.h" 562306a36Sopenharmony_ci 662306a36Sopenharmony_ci/* 762306a36Sopenharmony_ci * To implement hierarchical throttling, throtl_grps form a tree and bios 862306a36Sopenharmony_ci * are dispatched upwards level by level until they reach the top and get 962306a36Sopenharmony_ci * issued. When dispatching bios from the children and local group at each 1062306a36Sopenharmony_ci * level, if the bios are dispatched into a single bio_list, there's a risk 1162306a36Sopenharmony_ci * of a local or child group which can queue many bios at once filling up 1262306a36Sopenharmony_ci * the list starving others. 1362306a36Sopenharmony_ci * 1462306a36Sopenharmony_ci * To avoid such starvation, dispatched bios are queued separately 1562306a36Sopenharmony_ci * according to where they came from. When they are again dispatched to 1662306a36Sopenharmony_ci * the parent, they're popped in round-robin order so that no single source 1762306a36Sopenharmony_ci * hogs the dispatch window. 1862306a36Sopenharmony_ci * 1962306a36Sopenharmony_ci * throtl_qnode is used to keep the queued bios separated by their sources. 2062306a36Sopenharmony_ci * Bios are queued to throtl_qnode which in turn is queued to 2162306a36Sopenharmony_ci * throtl_service_queue and then dispatched in round-robin order. 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci * It's also used to track the reference counts on blkg's. A qnode always 2462306a36Sopenharmony_ci * belongs to a throtl_grp and gets queued on itself or the parent, so 2562306a36Sopenharmony_ci * incrementing the reference of the associated throtl_grp when a qnode is 2662306a36Sopenharmony_ci * queued and decrementing when dequeued is enough to keep the whole blkg 2762306a36Sopenharmony_ci * tree pinned while bios are in flight. 2862306a36Sopenharmony_ci */ 2962306a36Sopenharmony_cistruct throtl_qnode { 3062306a36Sopenharmony_ci struct list_head node; /* service_queue->queued[] */ 3162306a36Sopenharmony_ci struct bio_list bios; /* queued bios */ 3262306a36Sopenharmony_ci struct throtl_grp *tg; /* tg this qnode belongs to */ 3362306a36Sopenharmony_ci}; 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_cistruct throtl_service_queue { 3662306a36Sopenharmony_ci struct throtl_service_queue *parent_sq; /* the parent service_queue */ 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci /* 3962306a36Sopenharmony_ci * Bios queued directly to this service_queue or dispatched from 4062306a36Sopenharmony_ci * children throtl_grp's. 4162306a36Sopenharmony_ci */ 4262306a36Sopenharmony_ci struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ 4362306a36Sopenharmony_ci unsigned int nr_queued[2]; /* number of queued bios */ 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_ci /* 4662306a36Sopenharmony_ci * RB tree of active children throtl_grp's, which are sorted by 4762306a36Sopenharmony_ci * their ->disptime. 4862306a36Sopenharmony_ci */ 4962306a36Sopenharmony_ci struct rb_root_cached pending_tree; /* RB tree of active tgs */ 5062306a36Sopenharmony_ci unsigned int nr_pending; /* # queued in the tree */ 5162306a36Sopenharmony_ci unsigned long first_pending_disptime; /* disptime of the first tg */ 5262306a36Sopenharmony_ci struct timer_list pending_timer; /* fires on first_pending_disptime */ 5362306a36Sopenharmony_ci}; 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_cienum tg_state_flags { 5662306a36Sopenharmony_ci THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ 5762306a36Sopenharmony_ci THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ 5862306a36Sopenharmony_ci THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ 5962306a36Sopenharmony_ci}; 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_cienum { 6262306a36Sopenharmony_ci LIMIT_LOW, 6362306a36Sopenharmony_ci LIMIT_MAX, 6462306a36Sopenharmony_ci LIMIT_CNT, 6562306a36Sopenharmony_ci}; 6662306a36Sopenharmony_ci 6762306a36Sopenharmony_cistruct throtl_grp { 6862306a36Sopenharmony_ci /* must be the first member */ 6962306a36Sopenharmony_ci struct blkg_policy_data pd; 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_ci /* active throtl group service_queue member */ 7262306a36Sopenharmony_ci struct rb_node rb_node; 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_ci /* throtl_data this group belongs to */ 7562306a36Sopenharmony_ci struct throtl_data *td; 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci /* this group's service queue */ 7862306a36Sopenharmony_ci struct throtl_service_queue service_queue; 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_ci /* 8162306a36Sopenharmony_ci * qnode_on_self is used when bios are directly queued to this 8262306a36Sopenharmony_ci * throtl_grp so that local bios compete fairly with bios 8362306a36Sopenharmony_ci * dispatched from children. qnode_on_parent is used when bios are 8462306a36Sopenharmony_ci * dispatched from this throtl_grp into its parent and will compete 8562306a36Sopenharmony_ci * with the sibling qnode_on_parents and the parent's 8662306a36Sopenharmony_ci * qnode_on_self. 8762306a36Sopenharmony_ci */ 8862306a36Sopenharmony_ci struct throtl_qnode qnode_on_self[2]; 8962306a36Sopenharmony_ci struct throtl_qnode qnode_on_parent[2]; 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_ci /* 9262306a36Sopenharmony_ci * Dispatch time in jiffies. This is the estimated time when group 9362306a36Sopenharmony_ci * will unthrottle and is ready to dispatch more bio. It is used as 9462306a36Sopenharmony_ci * key to sort active groups in service tree. 9562306a36Sopenharmony_ci */ 9662306a36Sopenharmony_ci unsigned long disptime; 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci unsigned int flags; 9962306a36Sopenharmony_ci 10062306a36Sopenharmony_ci /* are there any throtl rules between this group and td? */ 10162306a36Sopenharmony_ci bool has_rules_bps[2]; 10262306a36Sopenharmony_ci bool has_rules_iops[2]; 10362306a36Sopenharmony_ci 10462306a36Sopenharmony_ci /* internally used bytes per second rate limits */ 10562306a36Sopenharmony_ci uint64_t bps[2][LIMIT_CNT]; 10662306a36Sopenharmony_ci /* user configured bps limits */ 10762306a36Sopenharmony_ci uint64_t bps_conf[2][LIMIT_CNT]; 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci /* internally used IOPS limits */ 11062306a36Sopenharmony_ci unsigned int iops[2][LIMIT_CNT]; 11162306a36Sopenharmony_ci /* user configured IOPS limits */ 11262306a36Sopenharmony_ci unsigned int iops_conf[2][LIMIT_CNT]; 11362306a36Sopenharmony_ci 11462306a36Sopenharmony_ci /* Number of bytes dispatched in current slice */ 11562306a36Sopenharmony_ci uint64_t bytes_disp[2]; 11662306a36Sopenharmony_ci /* Number of bio's dispatched in current slice */ 11762306a36Sopenharmony_ci unsigned int io_disp[2]; 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci unsigned long last_low_overflow_time[2]; 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci uint64_t last_bytes_disp[2]; 12262306a36Sopenharmony_ci unsigned int last_io_disp[2]; 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_ci /* 12562306a36Sopenharmony_ci * The following two fields are updated when new configuration is 12662306a36Sopenharmony_ci * submitted while some bios are still throttled, they record how many 12762306a36Sopenharmony_ci * bytes/ios are waited already in previous configuration, and they will 12862306a36Sopenharmony_ci * be used to calculate wait time under new configuration. 12962306a36Sopenharmony_ci */ 13062306a36Sopenharmony_ci long long carryover_bytes[2]; 13162306a36Sopenharmony_ci int carryover_ios[2]; 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci unsigned long last_check_time; 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci unsigned long latency_target; /* us */ 13662306a36Sopenharmony_ci unsigned long latency_target_conf; /* us */ 13762306a36Sopenharmony_ci /* When did we start a new slice */ 13862306a36Sopenharmony_ci unsigned long slice_start[2]; 13962306a36Sopenharmony_ci unsigned long slice_end[2]; 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci unsigned long last_finish_time; /* ns / 1024 */ 14262306a36Sopenharmony_ci unsigned long checked_last_finish_time; /* ns / 1024 */ 14362306a36Sopenharmony_ci unsigned long avg_idletime; /* ns / 1024 */ 14462306a36Sopenharmony_ci unsigned long idletime_threshold; /* us */ 14562306a36Sopenharmony_ci unsigned long idletime_threshold_conf; /* us */ 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci unsigned int bio_cnt; /* total bios */ 14862306a36Sopenharmony_ci unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ 14962306a36Sopenharmony_ci unsigned long bio_cnt_reset_time; 15062306a36Sopenharmony_ci 15162306a36Sopenharmony_ci struct blkg_rwstat stat_bytes; 15262306a36Sopenharmony_ci struct blkg_rwstat stat_ios; 15362306a36Sopenharmony_ci}; 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ciextern struct blkcg_policy blkcg_policy_throtl; 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_cistatic inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 15862306a36Sopenharmony_ci{ 15962306a36Sopenharmony_ci return pd ? container_of(pd, struct throtl_grp, pd) : NULL; 16062306a36Sopenharmony_ci} 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_cistatic inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci/* 16862306a36Sopenharmony_ci * Internal throttling interface 16962306a36Sopenharmony_ci */ 17062306a36Sopenharmony_ci#ifndef CONFIG_BLK_DEV_THROTTLING 17162306a36Sopenharmony_cistatic inline int blk_throtl_init(struct gendisk *disk) { return 0; } 17262306a36Sopenharmony_cistatic inline void blk_throtl_exit(struct gendisk *disk) { } 17362306a36Sopenharmony_cistatic inline void blk_throtl_register(struct gendisk *disk) { } 17462306a36Sopenharmony_cistatic inline bool blk_throtl_bio(struct bio *bio) { return false; } 17562306a36Sopenharmony_cistatic inline void blk_throtl_cancel_bios(struct gendisk *disk) { } 17662306a36Sopenharmony_ci#else /* CONFIG_BLK_DEV_THROTTLING */ 17762306a36Sopenharmony_ciint blk_throtl_init(struct gendisk *disk); 17862306a36Sopenharmony_civoid blk_throtl_exit(struct gendisk *disk); 17962306a36Sopenharmony_civoid blk_throtl_register(struct gendisk *disk); 18062306a36Sopenharmony_cibool __blk_throtl_bio(struct bio *bio); 18162306a36Sopenharmony_civoid blk_throtl_cancel_bios(struct gendisk *disk); 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_cistatic inline bool blk_should_throtl(struct bio *bio) 18462306a36Sopenharmony_ci{ 18562306a36Sopenharmony_ci struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); 18662306a36Sopenharmony_ci int rw = bio_data_dir(bio); 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { 18962306a36Sopenharmony_ci if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { 19062306a36Sopenharmony_ci bio_set_flag(bio, BIO_CGROUP_ACCT); 19162306a36Sopenharmony_ci blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, 19262306a36Sopenharmony_ci bio->bi_iter.bi_size); 19362306a36Sopenharmony_ci } 19462306a36Sopenharmony_ci blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); 19562306a36Sopenharmony_ci } 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_ci /* iops limit is always counted */ 19862306a36Sopenharmony_ci if (tg->has_rules_iops[rw]) 19962306a36Sopenharmony_ci return true; 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_ci if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) 20262306a36Sopenharmony_ci return true; 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci return false; 20562306a36Sopenharmony_ci} 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_cistatic inline bool blk_throtl_bio(struct bio *bio) 20862306a36Sopenharmony_ci{ 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci if (!blk_should_throtl(bio)) 21162306a36Sopenharmony_ci return false; 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci return __blk_throtl_bio(bio); 21462306a36Sopenharmony_ci} 21562306a36Sopenharmony_ci#endif /* CONFIG_BLK_DEV_THROTTLING */ 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci#endif 218