// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic :
 * Fast classification depends on skb->sk being set before reaching us.
 * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees
 * They are also part of one Round Robin 'queues' (new or old flows)
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If non existent flow, create it, add it to the tree.
 *     Add skb to the per flow list of skb (fifo).
 *   - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it's expected to send additional packets,
 * or SLAB cache will reuse socket for another flow)
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

/* Per-skb private state stored in qdisc_skb_cb : the earliest departure
 * time (nanoseconds) at which this packet may leave the qdisc.
 * Set at enqueue time from skb->tstamp (EDT) or the current clock.
 */
struct fq_skb_cb {
	u64 time_to_send;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line, used in fq_dequeue() */
	int		credit;
	/* 32bit hole on 64bit arches */

	struct fq_flow *next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
} ____cacheline_aligned_in_smp;

/* Singly-linked list of flows, used for the two Round Robin queues */
struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

/* Qdisc private data (one instance per fq qdisc) */
struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	u64		ktime_cache;	/* copy of last ktime_get_ns() */
	unsigned long	unthrottle_latency_ns;

	struct fq_flow	internal;	/* for non classified or high prio packets */
	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u64		horizon;	/* horizon in ns */
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;
	u8		horizon_drop;
	u32		flows;
	u32		inactive_flows;
	u32		throttled_flows;

	u64		stat_gc_flows;
	u64		stat_internal_packets;
	u64		stat_throttled;
	u64		stat_ce_mark;
	u64		stat_horizon_drops;
	u64		stat_horizon_caps;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;

	u32		timer_slack; /* hrtimer slack in ns */
	struct qdisc_watchdog watchdog;
};

/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 */
static void fq_flow_set_detached(struct fq_flow *f)
{
	f->age = jiffies | 1UL;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return !!(f->age & 1UL);
}

/* special value to mark a throttled flow (not on old/new list) */
static struct fq_flow throttled;

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}

/* Append flow at the tail of one of the two Round Robin lists */
static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

/* Remove flow from the 'delayed' rbtree and put it back on old_flows */
static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(&q->old_flows, f);
}

/* Insert flow into the 'delayed' rbtree, keyed by f->time_next_packet,
 * and keep q->time_next_delayed_flow as the minimum over all throttled flows.
 */
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;


/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

/* A flow may be reclaimed once it has been empty (detached) for FQ_GC_AGE */
static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}
/* Garbage collect up to FQ_GC_MAX stale flows encountered on the rbtree
 * path while searching for @sk; freed flows are batched to
 * kmem_cache_free_bulk().
 */
static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	/* Unlink all victims first, then free them in one bulk call */
	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}

/* Map one skb to its flow, creating the flow if needed.
 * Returns &q->internal for high priority (TC_PRIO_CONTROL) packets,
 * or when flow allocation fails.
 */
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* warning: no starvation prevention... */
	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
		return &q->internal;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are non connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	/* Opportunistic garbage collection when the table looks crowded */
	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	/* No existing flow : allocate and insert a new one */
	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}

/* Return the skb with the earliest time_to_send in this flow,
 * considering both the out-of-order rbtree and the linear list.
 */
static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}

static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		/* NOTE(review): presumably skb->rbnode overlays skb->dev in
		 * the sk_buff layout, hence the restore here — confirm
		 * against struct sk_buff definition.
		 */
		skb->dev = qdisc_dev(sch);
	}
}

/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	flow->qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}

/* Queue skb on its flow : O(1) tail append when time_to_send is not
 * smaller than the current tail's, otherwise insert into the rbtree.
 */
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}

/* True if skb's EDT timestamp is beyond the configured horizon,
 * based on the cached clock value q->ktime_cache.
 */
static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
				     const struct fq_sched_data *q)
{
	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch, to_free);

	if (!skb->tstamp) {
		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
	} else {
		/* Check if packet timestamp is too far in the future.
		 * Try first if our cached value, to avoid ktime_get_ns()
		 * cost in most cases.
		 */
		if (fq_packet_beyond_horizon(skb, q)) {
			/* Refresh our cache and check another time */
			q->ktime_cache = ktime_get_ns();
			if (fq_packet_beyond_horizon(skb, q)) {
				if (q->horizon_drop) {
					q->stat_horizon_drops++;
					return qdisc_drop(skb, sch, to_free);
				}
				/* Cap the timestamp to the horizon instead */
				q->stat_horizon_caps++;
				skb->tstamp = q->ktime_cache + q->horizon;
			}
		}
		fq_skb_cb(skb)->time_to_send = skb->tstamp;
	}

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch, to_free);
	}

	f->qlen++;
	qdisc_qstats_backlog_inc(sch, skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (time_after(jiffies, f->age + q->flow_refill_delay))
			f->credit = max_t(u32, f->credit, q->quantum);
		q->inactive_flows--;
	}

	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

/* Move flows whose release time has passed out of the 'delayed' rbtree
 * (back onto old_flows), and recompute the next wakeup time.
 */
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
	q->unthrottle_latency_ns += sample >> 3;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}

static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	/* The internal (high prio / unclassified) flow is served first */
	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	q->ktime_cache = now = ktime_get_ns();
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			/* Nothing runnable : arm the watchdog for the next
			 * throttled flow release time, if any.
			 */
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_range_ns(&q->watchdog,
							q->time_next_delayed_flow,
							q->timer_slack);
			return NULL;
		}
	}
	f = head->first;

	/* Out of credit : refill and rotate flow to old_flows */
	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		/* CE-mark packets delayed beyond ce_threshold */
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			q->inactive_flows++;
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(skb->sk->sk_pacing_rate, rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}

/* Free every packet of one flow : rbtree first, then the linear list */
static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}

/* Drop all queued packets and flows, returning the qdisc to its
 * pristine (empty) state.
 */
static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->flows = 0;
	q->inactive_flows = 0;
	q->throttled_flows = 0;
}

/* Move all flows from old_array (old_log buckets) into new_array
 * (new_log buckets), freeing gc candidates along the way.
 */
static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
72162306a36Sopenharmony_ci while (*np) { 72262306a36Sopenharmony_ci parent = *np; 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci nf = rb_entry(parent, struct fq_flow, fq_node); 72562306a36Sopenharmony_ci BUG_ON(nf->sk == of->sk); 72662306a36Sopenharmony_ci 72762306a36Sopenharmony_ci if (nf->sk > of->sk) 72862306a36Sopenharmony_ci np = &parent->rb_right; 72962306a36Sopenharmony_ci else 73062306a36Sopenharmony_ci np = &parent->rb_left; 73162306a36Sopenharmony_ci } 73262306a36Sopenharmony_ci 73362306a36Sopenharmony_ci rb_link_node(&of->fq_node, parent, np); 73462306a36Sopenharmony_ci rb_insert_color(&of->fq_node, nroot); 73562306a36Sopenharmony_ci } 73662306a36Sopenharmony_ci } 73762306a36Sopenharmony_ci q->flows -= fcnt; 73862306a36Sopenharmony_ci q->inactive_flows -= fcnt; 73962306a36Sopenharmony_ci q->stat_gc_flows += fcnt; 74062306a36Sopenharmony_ci} 74162306a36Sopenharmony_ci 74262306a36Sopenharmony_cistatic void fq_free(void *addr) 74362306a36Sopenharmony_ci{ 74462306a36Sopenharmony_ci kvfree(addr); 74562306a36Sopenharmony_ci} 74662306a36Sopenharmony_ci 74762306a36Sopenharmony_cistatic int fq_resize(struct Qdisc *sch, u32 log) 74862306a36Sopenharmony_ci{ 74962306a36Sopenharmony_ci struct fq_sched_data *q = qdisc_priv(sch); 75062306a36Sopenharmony_ci struct rb_root *array; 75162306a36Sopenharmony_ci void *old_fq_root; 75262306a36Sopenharmony_ci u32 idx; 75362306a36Sopenharmony_ci 75462306a36Sopenharmony_ci if (q->fq_root && log == q->fq_trees_log) 75562306a36Sopenharmony_ci return 0; 75662306a36Sopenharmony_ci 75762306a36Sopenharmony_ci /* If XPS was setup, we can allocate memory on right NUMA node */ 75862306a36Sopenharmony_ci array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL, 75962306a36Sopenharmony_ci netdev_queue_numa_node_read(sch->dev_queue)); 76062306a36Sopenharmony_ci if (!array) 76162306a36Sopenharmony_ci return -ENOMEM; 76262306a36Sopenharmony_ci 76362306a36Sopenharmony_ci for (idx = 0; idx < (1U << log); 
idx++) 76462306a36Sopenharmony_ci array[idx] = RB_ROOT; 76562306a36Sopenharmony_ci 76662306a36Sopenharmony_ci sch_tree_lock(sch); 76762306a36Sopenharmony_ci 76862306a36Sopenharmony_ci old_fq_root = q->fq_root; 76962306a36Sopenharmony_ci if (old_fq_root) 77062306a36Sopenharmony_ci fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); 77162306a36Sopenharmony_ci 77262306a36Sopenharmony_ci q->fq_root = array; 77362306a36Sopenharmony_ci q->fq_trees_log = log; 77462306a36Sopenharmony_ci 77562306a36Sopenharmony_ci sch_tree_unlock(sch); 77662306a36Sopenharmony_ci 77762306a36Sopenharmony_ci fq_free(old_fq_root); 77862306a36Sopenharmony_ci 77962306a36Sopenharmony_ci return 0; 78062306a36Sopenharmony_ci} 78162306a36Sopenharmony_ci 78262306a36Sopenharmony_cistatic struct netlink_range_validation iq_range = { 78362306a36Sopenharmony_ci .max = INT_MAX, 78462306a36Sopenharmony_ci}; 78562306a36Sopenharmony_ci 78662306a36Sopenharmony_cistatic const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { 78762306a36Sopenharmony_ci [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, 78862306a36Sopenharmony_ci 78962306a36Sopenharmony_ci [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, 79062306a36Sopenharmony_ci [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, 79162306a36Sopenharmony_ci [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, 79262306a36Sopenharmony_ci [TCA_FQ_INITIAL_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range), 79362306a36Sopenharmony_ci [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, 79462306a36Sopenharmony_ci [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, 79562306a36Sopenharmony_ci [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, 79662306a36Sopenharmony_ci [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, 79762306a36Sopenharmony_ci [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, 79862306a36Sopenharmony_ci [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, 79962306a36Sopenharmony_ci [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, 80062306a36Sopenharmony_ci [TCA_FQ_CE_THRESHOLD] = { .type = 
NLA_U32 }, 80162306a36Sopenharmony_ci [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, 80262306a36Sopenharmony_ci [TCA_FQ_HORIZON] = { .type = NLA_U32 }, 80362306a36Sopenharmony_ci [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, 80462306a36Sopenharmony_ci}; 80562306a36Sopenharmony_ci 80662306a36Sopenharmony_cistatic int fq_change(struct Qdisc *sch, struct nlattr *opt, 80762306a36Sopenharmony_ci struct netlink_ext_ack *extack) 80862306a36Sopenharmony_ci{ 80962306a36Sopenharmony_ci struct fq_sched_data *q = qdisc_priv(sch); 81062306a36Sopenharmony_ci struct nlattr *tb[TCA_FQ_MAX + 1]; 81162306a36Sopenharmony_ci int err, drop_count = 0; 81262306a36Sopenharmony_ci unsigned drop_len = 0; 81362306a36Sopenharmony_ci u32 fq_log; 81462306a36Sopenharmony_ci 81562306a36Sopenharmony_ci err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, 81662306a36Sopenharmony_ci NULL); 81762306a36Sopenharmony_ci if (err < 0) 81862306a36Sopenharmony_ci return err; 81962306a36Sopenharmony_ci 82062306a36Sopenharmony_ci sch_tree_lock(sch); 82162306a36Sopenharmony_ci 82262306a36Sopenharmony_ci fq_log = q->fq_trees_log; 82362306a36Sopenharmony_ci 82462306a36Sopenharmony_ci if (tb[TCA_FQ_BUCKETS_LOG]) { 82562306a36Sopenharmony_ci u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); 82662306a36Sopenharmony_ci 82762306a36Sopenharmony_ci if (nval >= 1 && nval <= ilog2(256*1024)) 82862306a36Sopenharmony_ci fq_log = nval; 82962306a36Sopenharmony_ci else 83062306a36Sopenharmony_ci err = -EINVAL; 83162306a36Sopenharmony_ci } 83262306a36Sopenharmony_ci if (tb[TCA_FQ_PLIMIT]) 83362306a36Sopenharmony_ci sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); 83462306a36Sopenharmony_ci 83562306a36Sopenharmony_ci if (tb[TCA_FQ_FLOW_PLIMIT]) 83662306a36Sopenharmony_ci q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); 83762306a36Sopenharmony_ci 83862306a36Sopenharmony_ci if (tb[TCA_FQ_QUANTUM]) { 83962306a36Sopenharmony_ci u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); 84062306a36Sopenharmony_ci 
84162306a36Sopenharmony_ci if (quantum > 0 && quantum <= (1 << 20)) { 84262306a36Sopenharmony_ci q->quantum = quantum; 84362306a36Sopenharmony_ci } else { 84462306a36Sopenharmony_ci NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); 84562306a36Sopenharmony_ci err = -EINVAL; 84662306a36Sopenharmony_ci } 84762306a36Sopenharmony_ci } 84862306a36Sopenharmony_ci 84962306a36Sopenharmony_ci if (tb[TCA_FQ_INITIAL_QUANTUM]) 85062306a36Sopenharmony_ci q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); 85162306a36Sopenharmony_ci 85262306a36Sopenharmony_ci if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) 85362306a36Sopenharmony_ci pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", 85462306a36Sopenharmony_ci nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); 85562306a36Sopenharmony_ci 85662306a36Sopenharmony_ci if (tb[TCA_FQ_FLOW_MAX_RATE]) { 85762306a36Sopenharmony_ci u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); 85862306a36Sopenharmony_ci 85962306a36Sopenharmony_ci q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; 86062306a36Sopenharmony_ci } 86162306a36Sopenharmony_ci if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) 86262306a36Sopenharmony_ci q->low_rate_threshold = 86362306a36Sopenharmony_ci nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); 86462306a36Sopenharmony_ci 86562306a36Sopenharmony_ci if (tb[TCA_FQ_RATE_ENABLE]) { 86662306a36Sopenharmony_ci u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_ci if (enable <= 1) 86962306a36Sopenharmony_ci q->rate_enable = enable; 87062306a36Sopenharmony_ci else 87162306a36Sopenharmony_ci err = -EINVAL; 87262306a36Sopenharmony_ci } 87362306a36Sopenharmony_ci 87462306a36Sopenharmony_ci if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { 87562306a36Sopenharmony_ci u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; 87662306a36Sopenharmony_ci 87762306a36Sopenharmony_ci q->flow_refill_delay = usecs_to_jiffies(usecs_delay); 87862306a36Sopenharmony_ci } 87962306a36Sopenharmony_ci 88062306a36Sopenharmony_ci if 
(tb[TCA_FQ_ORPHAN_MASK]) 88162306a36Sopenharmony_ci q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); 88262306a36Sopenharmony_ci 88362306a36Sopenharmony_ci if (tb[TCA_FQ_CE_THRESHOLD]) 88462306a36Sopenharmony_ci q->ce_threshold = (u64)NSEC_PER_USEC * 88562306a36Sopenharmony_ci nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); 88662306a36Sopenharmony_ci 88762306a36Sopenharmony_ci if (tb[TCA_FQ_TIMER_SLACK]) 88862306a36Sopenharmony_ci q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); 88962306a36Sopenharmony_ci 89062306a36Sopenharmony_ci if (tb[TCA_FQ_HORIZON]) 89162306a36Sopenharmony_ci q->horizon = (u64)NSEC_PER_USEC * 89262306a36Sopenharmony_ci nla_get_u32(tb[TCA_FQ_HORIZON]); 89362306a36Sopenharmony_ci 89462306a36Sopenharmony_ci if (tb[TCA_FQ_HORIZON_DROP]) 89562306a36Sopenharmony_ci q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); 89662306a36Sopenharmony_ci 89762306a36Sopenharmony_ci if (!err) { 89862306a36Sopenharmony_ci 89962306a36Sopenharmony_ci sch_tree_unlock(sch); 90062306a36Sopenharmony_ci err = fq_resize(sch, fq_log); 90162306a36Sopenharmony_ci sch_tree_lock(sch); 90262306a36Sopenharmony_ci } 90362306a36Sopenharmony_ci while (sch->q.qlen > sch->limit) { 90462306a36Sopenharmony_ci struct sk_buff *skb = fq_dequeue(sch); 90562306a36Sopenharmony_ci 90662306a36Sopenharmony_ci if (!skb) 90762306a36Sopenharmony_ci break; 90862306a36Sopenharmony_ci drop_len += qdisc_pkt_len(skb); 90962306a36Sopenharmony_ci rtnl_kfree_skbs(skb, skb); 91062306a36Sopenharmony_ci drop_count++; 91162306a36Sopenharmony_ci } 91262306a36Sopenharmony_ci qdisc_tree_reduce_backlog(sch, drop_count, drop_len); 91362306a36Sopenharmony_ci 91462306a36Sopenharmony_ci sch_tree_unlock(sch); 91562306a36Sopenharmony_ci return err; 91662306a36Sopenharmony_ci} 91762306a36Sopenharmony_ci 91862306a36Sopenharmony_cistatic void fq_destroy(struct Qdisc *sch) 91962306a36Sopenharmony_ci{ 92062306a36Sopenharmony_ci struct fq_sched_data *q = qdisc_priv(sch); 92162306a36Sopenharmony_ci 
92262306a36Sopenharmony_ci fq_reset(sch); 92362306a36Sopenharmony_ci fq_free(q->fq_root); 92462306a36Sopenharmony_ci qdisc_watchdog_cancel(&q->watchdog); 92562306a36Sopenharmony_ci} 92662306a36Sopenharmony_ci 92762306a36Sopenharmony_cistatic int fq_init(struct Qdisc *sch, struct nlattr *opt, 92862306a36Sopenharmony_ci struct netlink_ext_ack *extack) 92962306a36Sopenharmony_ci{ 93062306a36Sopenharmony_ci struct fq_sched_data *q = qdisc_priv(sch); 93162306a36Sopenharmony_ci int err; 93262306a36Sopenharmony_ci 93362306a36Sopenharmony_ci sch->limit = 10000; 93462306a36Sopenharmony_ci q->flow_plimit = 100; 93562306a36Sopenharmony_ci q->quantum = 2 * psched_mtu(qdisc_dev(sch)); 93662306a36Sopenharmony_ci q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); 93762306a36Sopenharmony_ci q->flow_refill_delay = msecs_to_jiffies(40); 93862306a36Sopenharmony_ci q->flow_max_rate = ~0UL; 93962306a36Sopenharmony_ci q->time_next_delayed_flow = ~0ULL; 94062306a36Sopenharmony_ci q->rate_enable = 1; 94162306a36Sopenharmony_ci q->new_flows.first = NULL; 94262306a36Sopenharmony_ci q->old_flows.first = NULL; 94362306a36Sopenharmony_ci q->delayed = RB_ROOT; 94462306a36Sopenharmony_ci q->fq_root = NULL; 94562306a36Sopenharmony_ci q->fq_trees_log = ilog2(1024); 94662306a36Sopenharmony_ci q->orphan_mask = 1024 - 1; 94762306a36Sopenharmony_ci q->low_rate_threshold = 550000 / 8; 94862306a36Sopenharmony_ci 94962306a36Sopenharmony_ci q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ 95062306a36Sopenharmony_ci 95162306a36Sopenharmony_ci q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ 95262306a36Sopenharmony_ci q->horizon_drop = 1; /* by default, drop packets beyond horizon */ 95362306a36Sopenharmony_ci 95462306a36Sopenharmony_ci /* Default ce_threshold of 4294 seconds */ 95562306a36Sopenharmony_ci q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; 95662306a36Sopenharmony_ci 95762306a36Sopenharmony_ci qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); 
95862306a36Sopenharmony_ci 95962306a36Sopenharmony_ci if (opt) 96062306a36Sopenharmony_ci err = fq_change(sch, opt, extack); 96162306a36Sopenharmony_ci else 96262306a36Sopenharmony_ci err = fq_resize(sch, q->fq_trees_log); 96362306a36Sopenharmony_ci 96462306a36Sopenharmony_ci return err; 96562306a36Sopenharmony_ci} 96662306a36Sopenharmony_ci 96762306a36Sopenharmony_cistatic int fq_dump(struct Qdisc *sch, struct sk_buff *skb) 96862306a36Sopenharmony_ci{ 96962306a36Sopenharmony_ci struct fq_sched_data *q = qdisc_priv(sch); 97062306a36Sopenharmony_ci u64 ce_threshold = q->ce_threshold; 97162306a36Sopenharmony_ci u64 horizon = q->horizon; 97262306a36Sopenharmony_ci struct nlattr *opts; 97362306a36Sopenharmony_ci 97462306a36Sopenharmony_ci opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 97562306a36Sopenharmony_ci if (opts == NULL) 97662306a36Sopenharmony_ci goto nla_put_failure; 97762306a36Sopenharmony_ci 97862306a36Sopenharmony_ci /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ 97962306a36Sopenharmony_ci 98062306a36Sopenharmony_ci do_div(ce_threshold, NSEC_PER_USEC); 98162306a36Sopenharmony_ci do_div(horizon, NSEC_PER_USEC); 98262306a36Sopenharmony_ci 98362306a36Sopenharmony_ci if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || 98462306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || 98562306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || 98662306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || 98762306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || 98862306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, 98962306a36Sopenharmony_ci min_t(unsigned long, q->flow_max_rate, ~0U)) || 99062306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, 99162306a36Sopenharmony_ci jiffies_to_usecs(q->flow_refill_delay)) || 99262306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || 99362306a36Sopenharmony_ci nla_put_u32(skb, 
TCA_FQ_LOW_RATE_THRESHOLD, 99462306a36Sopenharmony_ci q->low_rate_threshold) || 99562306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || 99662306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || 99762306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || 99862306a36Sopenharmony_ci nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || 99962306a36Sopenharmony_ci nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) 100062306a36Sopenharmony_ci goto nla_put_failure; 100162306a36Sopenharmony_ci 100262306a36Sopenharmony_ci return nla_nest_end(skb, opts); 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_cinla_put_failure: 100562306a36Sopenharmony_ci return -1; 100662306a36Sopenharmony_ci} 100762306a36Sopenharmony_ci 100862306a36Sopenharmony_cistatic int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 100962306a36Sopenharmony_ci{ 101062306a36Sopenharmony_ci struct fq_sched_data *q = qdisc_priv(sch); 101162306a36Sopenharmony_ci struct tc_fq_qd_stats st; 101262306a36Sopenharmony_ci 101362306a36Sopenharmony_ci sch_tree_lock(sch); 101462306a36Sopenharmony_ci 101562306a36Sopenharmony_ci st.gc_flows = q->stat_gc_flows; 101662306a36Sopenharmony_ci st.highprio_packets = q->stat_internal_packets; 101762306a36Sopenharmony_ci st.tcp_retrans = 0; 101862306a36Sopenharmony_ci st.throttled = q->stat_throttled; 101962306a36Sopenharmony_ci st.flows_plimit = q->stat_flows_plimit; 102062306a36Sopenharmony_ci st.pkts_too_long = q->stat_pkts_too_long; 102162306a36Sopenharmony_ci st.allocation_errors = q->stat_allocation_errors; 102262306a36Sopenharmony_ci st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - 102362306a36Sopenharmony_ci ktime_get_ns(); 102462306a36Sopenharmony_ci st.flows = q->flows; 102562306a36Sopenharmony_ci st.inactive_flows = q->inactive_flows; 102662306a36Sopenharmony_ci st.throttled_flows = q->throttled_flows; 102762306a36Sopenharmony_ci st.unthrottle_latency_ns = 
min_t(unsigned long, 102862306a36Sopenharmony_ci q->unthrottle_latency_ns, ~0U); 102962306a36Sopenharmony_ci st.ce_mark = q->stat_ce_mark; 103062306a36Sopenharmony_ci st.horizon_drops = q->stat_horizon_drops; 103162306a36Sopenharmony_ci st.horizon_caps = q->stat_horizon_caps; 103262306a36Sopenharmony_ci sch_tree_unlock(sch); 103362306a36Sopenharmony_ci 103462306a36Sopenharmony_ci return gnet_stats_copy_app(d, &st, sizeof(st)); 103562306a36Sopenharmony_ci} 103662306a36Sopenharmony_ci 103762306a36Sopenharmony_cistatic struct Qdisc_ops fq_qdisc_ops __read_mostly = { 103862306a36Sopenharmony_ci .id = "fq", 103962306a36Sopenharmony_ci .priv_size = sizeof(struct fq_sched_data), 104062306a36Sopenharmony_ci 104162306a36Sopenharmony_ci .enqueue = fq_enqueue, 104262306a36Sopenharmony_ci .dequeue = fq_dequeue, 104362306a36Sopenharmony_ci .peek = qdisc_peek_dequeued, 104462306a36Sopenharmony_ci .init = fq_init, 104562306a36Sopenharmony_ci .reset = fq_reset, 104662306a36Sopenharmony_ci .destroy = fq_destroy, 104762306a36Sopenharmony_ci .change = fq_change, 104862306a36Sopenharmony_ci .dump = fq_dump, 104962306a36Sopenharmony_ci .dump_stats = fq_dump_stats, 105062306a36Sopenharmony_ci .owner = THIS_MODULE, 105162306a36Sopenharmony_ci}; 105262306a36Sopenharmony_ci 105362306a36Sopenharmony_cistatic int __init fq_module_init(void) 105462306a36Sopenharmony_ci{ 105562306a36Sopenharmony_ci int ret; 105662306a36Sopenharmony_ci 105762306a36Sopenharmony_ci fq_flow_cachep = kmem_cache_create("fq_flow_cache", 105862306a36Sopenharmony_ci sizeof(struct fq_flow), 105962306a36Sopenharmony_ci 0, 0, NULL); 106062306a36Sopenharmony_ci if (!fq_flow_cachep) 106162306a36Sopenharmony_ci return -ENOMEM; 106262306a36Sopenharmony_ci 106362306a36Sopenharmony_ci ret = register_qdisc(&fq_qdisc_ops); 106462306a36Sopenharmony_ci if (ret) 106562306a36Sopenharmony_ci kmem_cache_destroy(fq_flow_cachep); 106662306a36Sopenharmony_ci return ret; 106762306a36Sopenharmony_ci} 106862306a36Sopenharmony_ci 
106962306a36Sopenharmony_cistatic void __exit fq_module_exit(void) 107062306a36Sopenharmony_ci{ 107162306a36Sopenharmony_ci unregister_qdisc(&fq_qdisc_ops); 107262306a36Sopenharmony_ci kmem_cache_destroy(fq_flow_cachep); 107362306a36Sopenharmony_ci} 107462306a36Sopenharmony_ci 107562306a36Sopenharmony_cimodule_init(fq_module_init) 107662306a36Sopenharmony_cimodule_exit(fq_module_exit) 107762306a36Sopenharmony_ciMODULE_AUTHOR("Eric Dumazet"); 107862306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 107962306a36Sopenharmony_ciMODULE_DESCRIPTION("Fair Queue Packet Scheduler"); 1080