18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* Copyright (c) 2019 Facebook 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * This program is free software; you can redistribute it and/or 58c2ecf20Sopenharmony_ci * modify it under the terms of version 2 of the GNU General Public 68c2ecf20Sopenharmony_ci * License as published by the Free Software Foundation. 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * Sample Host Bandwidth Manager (HBM) BPF program. 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * A cgroup skb BPF egress program to limit cgroup output bandwidth. 118c2ecf20Sopenharmony_ci * It uses a modified virtual token bucket queue to limit average 128c2ecf20Sopenharmony_ci * egress bandwidth. The implementation uses credits instead of tokens. 138c2ecf20Sopenharmony_ci * Negative credits imply that queueing would have happened (this is 148c2ecf20Sopenharmony_ci * a virtual queue, so no queueing is done by it. However, queueing may 158c2ecf20Sopenharmony_ci * occur at the actual qdisc (which is not used for rate limiting). 168c2ecf20Sopenharmony_ci * 178c2ecf20Sopenharmony_ci * This implementation uses 3 thresholds, one to start marking packets and 188c2ecf20Sopenharmony_ci * the other two to drop packets: 198c2ecf20Sopenharmony_ci * CREDIT 208c2ecf20Sopenharmony_ci * - <--------------------------|------------------------> + 218c2ecf20Sopenharmony_ci * | | | 0 228c2ecf20Sopenharmony_ci * | Large pkt | 238c2ecf20Sopenharmony_ci * | drop thresh | 248c2ecf20Sopenharmony_ci * Small pkt drop Mark threshold 258c2ecf20Sopenharmony_ci * thresh 268c2ecf20Sopenharmony_ci * 278c2ecf20Sopenharmony_ci * The effect of marking depends on the type of packet: 288c2ecf20Sopenharmony_ci * a) If the packet is ECN enabled and it is a TCP packet, then the packet 298c2ecf20Sopenharmony_ci * is ECN marked. 308c2ecf20Sopenharmony_ci * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr 318c2ecf20Sopenharmony_ci * to reduce the congestion window. The current implementation uses a linear 328c2ecf20Sopenharmony_ci * distribution (0% probability at marking threshold, 100% probability 338c2ecf20Sopenharmony_ci * at drop threshold). 348c2ecf20Sopenharmony_ci * c) If the packet is not a TCP packet, then it is dropped. 358c2ecf20Sopenharmony_ci * 368c2ecf20Sopenharmony_ci * If the credit is below the drop threshold, the packet is dropped. If it 378c2ecf20Sopenharmony_ci * is a TCP packet, then it also calls tcp_cwr since packets dropped by 388c2ecf20Sopenharmony_ci * by a cgroup skb BPF program do not automatically trigger a call to 398c2ecf20Sopenharmony_ci * tcp_cwr in the current kernel code. 408c2ecf20Sopenharmony_ci * 418c2ecf20Sopenharmony_ci * This BPF program actually uses 2 drop thresholds, one threshold 428c2ecf20Sopenharmony_ci * for larger packets (>= 120 bytes) and another for smaller packets. This 438c2ecf20Sopenharmony_ci * protects smaller packets such as SYNs, ACKs, etc. 448c2ecf20Sopenharmony_ci * 458c2ecf20Sopenharmony_ci * The default bandwidth limit is set at 1Gbps but this can be changed by 468c2ecf20Sopenharmony_ci * a user program through a shared BPF map. In addition, by default this BPF 478c2ecf20Sopenharmony_ci * program does not limit connections using loopback. This behavior can be 488c2ecf20Sopenharmony_ci * overwritten by the user program. There is also an option to calculate 498c2ecf20Sopenharmony_ci * some statistics, such as percent of packets marked or dropped, which 508c2ecf20Sopenharmony_ci * a user program, such as hbm, can access. 518c2ecf20Sopenharmony_ci */ 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci#include "hbm_kern.h" 548c2ecf20Sopenharmony_ci 558c2ecf20Sopenharmony_ciSEC("cgroup_skb/egress") 568c2ecf20Sopenharmony_ciint _hbm_out_cg(struct __sk_buff *skb) 578c2ecf20Sopenharmony_ci{ 588c2ecf20Sopenharmony_ci long long delta = 0, delta_send; 598c2ecf20Sopenharmony_ci unsigned long long curtime, sendtime; 608c2ecf20Sopenharmony_ci struct hbm_queue_stats *qsp = NULL; 618c2ecf20Sopenharmony_ci unsigned int queue_index = 0; 628c2ecf20Sopenharmony_ci bool congestion_flag = false; 638c2ecf20Sopenharmony_ci bool ecn_ce_flag = false; 648c2ecf20Sopenharmony_ci struct hbm_pkt_info pkti = {}; 658c2ecf20Sopenharmony_ci struct hbm_vqueue *qdp; 668c2ecf20Sopenharmony_ci bool drop_flag = false; 678c2ecf20Sopenharmony_ci bool cwr_flag = false; 688c2ecf20Sopenharmony_ci int len = skb->len; 698c2ecf20Sopenharmony_ci int rv = ALLOW_PKT; 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); 728c2ecf20Sopenharmony_ci 738c2ecf20Sopenharmony_ci // Check if we should ignore loopback traffic 748c2ecf20Sopenharmony_ci if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) 758c2ecf20Sopenharmony_ci return ALLOW_PKT; 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci hbm_get_pkt_info(skb, &pkti); 788c2ecf20Sopenharmony_ci 798c2ecf20Sopenharmony_ci // We may want to account for the length of headers in len 808c2ecf20Sopenharmony_ci // calculation, like ETH header + overhead, specially if it 818c2ecf20Sopenharmony_ci // is a gso packet. But I am not doing it right now. 828c2ecf20Sopenharmony_ci 838c2ecf20Sopenharmony_ci qdp = bpf_get_local_storage(&queue_state, 0); 848c2ecf20Sopenharmony_ci if (!qdp) 858c2ecf20Sopenharmony_ci return ALLOW_PKT; 868c2ecf20Sopenharmony_ci if (qdp->lasttime == 0) 878c2ecf20Sopenharmony_ci hbm_init_edt_vqueue(qdp, 1024); 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci curtime = bpf_ktime_get_ns(); 908c2ecf20Sopenharmony_ci 918c2ecf20Sopenharmony_ci // Begin critical section 928c2ecf20Sopenharmony_ci bpf_spin_lock(&qdp->lock); 938c2ecf20Sopenharmony_ci delta = qdp->lasttime - curtime; 948c2ecf20Sopenharmony_ci // bound bursts to 100us 958c2ecf20Sopenharmony_ci if (delta < -BURST_SIZE_NS) { 968c2ecf20Sopenharmony_ci // negative delta is a credit that allows bursts 978c2ecf20Sopenharmony_ci qdp->lasttime = curtime - BURST_SIZE_NS; 988c2ecf20Sopenharmony_ci delta = -BURST_SIZE_NS; 998c2ecf20Sopenharmony_ci } 1008c2ecf20Sopenharmony_ci sendtime = qdp->lasttime; 1018c2ecf20Sopenharmony_ci delta_send = BYTES_TO_NS(len, qdp->rate); 1028c2ecf20Sopenharmony_ci __sync_add_and_fetch(&(qdp->lasttime), delta_send); 1038c2ecf20Sopenharmony_ci bpf_spin_unlock(&qdp->lock); 1048c2ecf20Sopenharmony_ci // End critical section 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci // Set EDT of packet 1078c2ecf20Sopenharmony_ci skb->tstamp = sendtime; 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci // Check if we should update rate 1108c2ecf20Sopenharmony_ci if (qsp != NULL && (qsp->rate * 128) != qdp->rate) 1118c2ecf20Sopenharmony_ci qdp->rate = qsp->rate * 128; 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci // Set flags (drop, congestion, cwr) 1148c2ecf20Sopenharmony_ci // last packet will be sent in the future, bound latency 1158c2ecf20Sopenharmony_ci if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS && 1168c2ecf20Sopenharmony_ci len > LARGE_PKT_THRESH)) { 1178c2ecf20Sopenharmony_ci drop_flag = true; 1188c2ecf20Sopenharmony_ci if (pkti.is_tcp && pkti.ecn == 0) 1198c2ecf20Sopenharmony_ci cwr_flag = true; 1208c2ecf20Sopenharmony_ci } else if (delta > MARK_THRESH_NS) { 1218c2ecf20Sopenharmony_ci if (pkti.is_tcp) 1228c2ecf20Sopenharmony_ci congestion_flag = true; 1238c2ecf20Sopenharmony_ci else 1248c2ecf20Sopenharmony_ci drop_flag = true; 1258c2ecf20Sopenharmony_ci } 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci if (congestion_flag) { 1288c2ecf20Sopenharmony_ci if (bpf_skb_ecn_set_ce(skb)) { 1298c2ecf20Sopenharmony_ci ecn_ce_flag = true; 1308c2ecf20Sopenharmony_ci } else { 1318c2ecf20Sopenharmony_ci if (pkti.is_tcp) { 1328c2ecf20Sopenharmony_ci unsigned int rand = bpf_get_prandom_u32(); 1338c2ecf20Sopenharmony_ci 1348c2ecf20Sopenharmony_ci if (delta >= MARK_THRESH_NS + 1358c2ecf20Sopenharmony_ci (rand % MARK_REGION_SIZE_NS)) { 1368c2ecf20Sopenharmony_ci // Do congestion control 1378c2ecf20Sopenharmony_ci cwr_flag = true; 1388c2ecf20Sopenharmony_ci } 1398c2ecf20Sopenharmony_ci } else if (len > LARGE_PKT_THRESH) { 1408c2ecf20Sopenharmony_ci // Problem if too many small packets? 1418c2ecf20Sopenharmony_ci drop_flag = true; 1428c2ecf20Sopenharmony_ci congestion_flag = false; 1438c2ecf20Sopenharmony_ci } 1448c2ecf20Sopenharmony_ci } 1458c2ecf20Sopenharmony_ci } 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_ci if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) { 1488c2ecf20Sopenharmony_ci drop_flag = false; 1498c2ecf20Sopenharmony_ci cwr_flag = true; 1508c2ecf20Sopenharmony_ci congestion_flag = false; 1518c2ecf20Sopenharmony_ci } 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci if (qsp != NULL && qsp->no_cn) 1548c2ecf20Sopenharmony_ci cwr_flag = false; 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_ci hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, 1578c2ecf20Sopenharmony_ci cwr_flag, ecn_ce_flag, &pkti, (int) delta); 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci if (drop_flag) { 1608c2ecf20Sopenharmony_ci __sync_add_and_fetch(&(qdp->lasttime), -delta_send); 1618c2ecf20Sopenharmony_ci rv = DROP_PKT; 1628c2ecf20Sopenharmony_ci } 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_ci if (cwr_flag) 1658c2ecf20Sopenharmony_ci rv |= CWR; 1668c2ecf20Sopenharmony_ci return rv; 1678c2ecf20Sopenharmony_ci} 1688c2ecf20Sopenharmony_cichar _license[] SEC("license") = "GPL"; 169