1// SPDX-License-Identifier: GPL-2.0 2/* Copyright (c) 2019 Facebook 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of version 2 of the GNU General Public 6 * License as published by the Free Software Foundation. 7 * 8 * Example program for Host Bandwidth Managment 9 * 10 * This program loads a cgroup skb BPF program to enforce cgroup output 11 * (egress) or input (ingress) bandwidth limits. 12 * 13 * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] 14 * Where: 15 * -d Print BPF trace debug buffer 16 * -l Also limit flows doing loopback 17 * -n <#> To create cgroup \"/hbm#\" and attach prog 18 * Default is /hbm1 19 * --no_cn Do not return cn notifications 20 * -r <rate> Rate limit in Mbps 21 * -s Get HBM stats (marked, dropped, etc.) 22 * -t <time> Exit after specified seconds (default is 0) 23 * -w Work conserving flag. cgroup can increase its bandwidth 24 * beyond the rate limit specified while there is available 25 * bandwidth. Current implementation assumes there is only 26 * NIC (eth0), but can be extended to support multiple NICs. 27 * Currrently only supported for egress. 28 * -h Print this info 29 * prog BPF program file name. Name defaults to hbm_out_kern.o 30 */ 31 32#define _GNU_SOURCE 33 34#include <stdio.h> 35#include <stdlib.h> 36#include <assert.h> 37#include <sys/resource.h> 38#include <sys/time.h> 39#include <unistd.h> 40#include <errno.h> 41#include <fcntl.h> 42#include <linux/unistd.h> 43#include <linux/compiler.h> 44 45#include <linux/bpf.h> 46#include <bpf/bpf.h> 47#include <getopt.h> 48 49#include "bpf_load.h" 50#include "bpf_rlimit.h" 51#include "cgroup_helpers.h" 52#include "hbm.h" 53#include "bpf_util.h" 54#include <bpf/bpf.h> 55#include <bpf/libbpf.h> 56 57bool outFlag = true; 58int minRate = 1000; /* cgroup rate limit in Mbps */ 59int rate = 1000; /* can grow if rate conserving is enabled */ 60int dur = 1; 61bool stats_flag; 62bool loopback_flag; 63bool debugFlag; 64bool work_conserving_flag; 65bool no_cn_flag; 66bool edt_flag; 67 68static void Usage(void); 69static void read_trace_pipe2(void); 70static void do_error(char *msg, bool errno_flag); 71 72#define DEBUGFS "/sys/kernel/debug/tracing/" 73 74struct bpf_object *obj; 75int bpfprog_fd; 76int cgroup_storage_fd; 77 78static void read_trace_pipe2(void) 79{ 80 int trace_fd; 81 FILE *outf; 82 char *outFname = "hbm_out.log"; 83 84 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); 85 if (trace_fd < 0) { 86 printf("Error opening trace_pipe\n"); 87 return; 88 } 89 90// Future support of ingress 91// if (!outFlag) 92// outFname = "hbm_in.log"; 93 outf = fopen(outFname, "w"); 94 95 if (outf == NULL) 96 printf("Error creating %s\n", outFname); 97 98 while (1) { 99 static char buf[4097]; 100 ssize_t sz; 101 102 sz = read(trace_fd, buf, sizeof(buf) - 1); 103 if (sz > 0) { 104 buf[sz] = 0; 105 puts(buf); 106 if (outf != NULL) { 107 fprintf(outf, "%s\n", buf); 108 fflush(outf); 109 } 110 } 111 } 112} 113 114static void do_error(char *msg, bool errno_flag) 115{ 116 if (errno_flag) 117 printf("ERROR: %s, errno: %d\n", msg, errno); 118 else 119 printf("ERROR: %s\n", msg); 120 exit(1); 121} 122 123static int prog_load(char *prog) 124{ 125 struct bpf_prog_load_attr prog_load_attr = { 126 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 127 .file = prog, 128 .expected_attach_type = BPF_CGROUP_INET_EGRESS, 129 }; 130 int map_fd; 131 struct bpf_map *map; 132 133 int ret = 0; 134 135 if (access(prog, O_RDONLY) < 0) { 136 printf("Error accessing file %s: %s\n", prog, strerror(errno)); 137 return 1; 138 } 139 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) 140 ret = 1; 141 if (!ret) { 142 map = bpf_object__find_map_by_name(obj, "queue_stats"); 143 map_fd = bpf_map__fd(map); 144 if (map_fd < 0) { 145 printf("Map not found: %s\n", strerror(map_fd)); 146 ret = 1; 147 } 148 } 149 150 if (ret) { 151 printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog); 152 printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); 153 ret = -1; 154 } else { 155 ret = map_fd; 156 } 157 158 return ret; 159} 160 161static int run_bpf_prog(char *prog, int cg_id) 162{ 163 int map_fd; 164 int rc = 0; 165 int key = 0; 166 int cg1 = 0; 167 int type = BPF_CGROUP_INET_EGRESS; 168 char cg_dir[100]; 169 struct hbm_queue_stats qstats = {0}; 170 171 sprintf(cg_dir, "/hbm%d", cg_id); 172 map_fd = prog_load(prog); 173 if (map_fd == -1) 174 return 1; 175 176 if (setup_cgroup_environment()) { 177 printf("ERROR: setting cgroup environment\n"); 178 goto err; 179 } 180 cg1 = create_and_get_cgroup(cg_dir); 181 if (!cg1) { 182 printf("ERROR: create_and_get_cgroup\n"); 183 goto err; 184 } 185 if (join_cgroup(cg_dir)) { 186 printf("ERROR: join_cgroup\n"); 187 goto err; 188 } 189 190 qstats.rate = rate; 191 qstats.stats = stats_flag ? 1 : 0; 192 qstats.loopback = loopback_flag ? 1 : 0; 193 qstats.no_cn = no_cn_flag ? 1 : 0; 194 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { 195 printf("ERROR: Could not update map element\n"); 196 goto err; 197 } 198 199 if (!outFlag) 200 type = BPF_CGROUP_INET_INGRESS; 201 if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { 202 printf("ERROR: bpf_prog_attach fails!\n"); 203 log_err("Attaching prog"); 204 goto err; 205 } 206 207 if (work_conserving_flag) { 208 struct timeval t0, t_last, t_new; 209 FILE *fin; 210 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; 211 signed long long last_cg_tx_bytes, new_cg_tx_bytes; 212 signed long long delta_time, delta_bytes, delta_rate; 213 int delta_ms; 214#define DELTA_RATE_CHECK 10000 /* in us */ 215#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ 216 217 bpf_map_lookup_elem(map_fd, &key, &qstats); 218 if (gettimeofday(&t0, NULL) < 0) 219 do_error("gettimeofday failed", true); 220 t_last = t0; 221 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); 222 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) 223 do_error("fscanf fails", false); 224 fclose(fin); 225 last_cg_tx_bytes = qstats.bytes_total; 226 while (true) { 227 usleep(DELTA_RATE_CHECK); 228 if (gettimeofday(&t_new, NULL) < 0) 229 do_error("gettimeofday failed", true); 230 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + 231 (t_new.tv_usec - t0.tv_usec)/1000; 232 if (delta_ms > dur * 1000) 233 break; 234 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + 235 (t_new.tv_usec - t_last.tv_usec); 236 if (delta_time == 0) 237 continue; 238 t_last = t_new; 239 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", 240 "r"); 241 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) 242 do_error("fscanf fails", false); 243 fclose(fin); 244 printf(" new_eth_tx_bytes:%llu\n", 245 new_eth_tx_bytes); 246 bpf_map_lookup_elem(map_fd, &key, &qstats); 247 new_cg_tx_bytes = qstats.bytes_total; 248 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; 249 last_eth_tx_bytes = new_eth_tx_bytes; 250 delta_rate = (delta_bytes * 8000000) / delta_time; 251 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", 252 delta_ms, delta_rate/1000000000.0, 253 rate/1000.0); 254 if (delta_rate < RATE_THRESHOLD) { 255 /* can increase cgroup rate limit, but first 256 * check if we are using the current limit. 257 * Currently increasing by 6.25%, unknown 258 * if that is the optimal rate. 259 */ 260 int rate_diff100; 261 262 delta_bytes = new_cg_tx_bytes - 263 last_cg_tx_bytes; 264 last_cg_tx_bytes = new_cg_tx_bytes; 265 delta_rate = (delta_bytes * 8000000) / 266 delta_time; 267 printf(" rate:%.3fGbps", 268 delta_rate/1000000000.0); 269 rate_diff100 = (((long long)rate)*1000000 - 270 delta_rate) * 100 / 271 (((long long) rate) * 1000000); 272 printf(" rdiff:%d", rate_diff100); 273 if (rate_diff100 <= 3) { 274 rate += (rate >> 4); 275 if (rate > RATE_THRESHOLD / 1000000) 276 rate = RATE_THRESHOLD / 1000000; 277 qstats.rate = rate; 278 printf(" INC\n"); 279 } else { 280 printf("\n"); 281 } 282 } else { 283 /* Need to decrease cgroup rate limit. 284 * Currently decreasing by 12.5%, unknown 285 * if that is optimal 286 */ 287 printf(" DEC\n"); 288 rate -= (rate >> 3); 289 if (rate < minRate) 290 rate = minRate; 291 qstats.rate = rate; 292 } 293 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) 294 do_error("update map element fails", false); 295 } 296 } else { 297 sleep(dur); 298 } 299 // Get stats! 300 if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { 301 char fname[100]; 302 FILE *fout; 303 304 if (!outFlag) 305 sprintf(fname, "hbm.%d.in", cg_id); 306 else 307 sprintf(fname, "hbm.%d.out", cg_id); 308 fout = fopen(fname, "w"); 309 fprintf(fout, "id:%d\n", cg_id); 310 fprintf(fout, "ERROR: Could not lookup queue_stats\n"); 311 fclose(fout); 312 } else if (stats_flag && qstats.lastPacketTime > 313 qstats.firstPacketTime) { 314 long long delta_us = (qstats.lastPacketTime - 315 qstats.firstPacketTime)/1000; 316 unsigned int rate_mbps = ((qstats.bytes_total - 317 qstats.bytes_dropped) * 8 / 318 delta_us); 319 double percent_pkts, percent_bytes; 320 char fname[100]; 321 FILE *fout; 322 int k; 323 static const char *returnValNames[] = { 324 "DROP_PKT", 325 "ALLOW_PKT", 326 "DROP_PKT_CWR", 327 "ALLOW_PKT_CWR" 328 }; 329#define RET_VAL_COUNT 4 330 331// Future support of ingress 332// if (!outFlag) 333// sprintf(fname, "hbm.%d.in", cg_id); 334// else 335 sprintf(fname, "hbm.%d.out", cg_id); 336 fout = fopen(fname, "w"); 337 fprintf(fout, "id:%d\n", cg_id); 338 fprintf(fout, "rate_mbps:%d\n", rate_mbps); 339 fprintf(fout, "duration:%.1f secs\n", 340 (qstats.lastPacketTime - qstats.firstPacketTime) / 341 1000000000.0); 342 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); 343 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / 344 1000000)); 345 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); 346 fprintf(fout, "bytes_dropped_MB:%d\n", 347 (int)(qstats.bytes_dropped / 348 1000000)); 349 // Marked Pkts and Bytes 350 percent_pkts = (qstats.pkts_marked * 100.0) / 351 (qstats.pkts_total + 1); 352 percent_bytes = (qstats.bytes_marked * 100.0) / 353 (qstats.bytes_total + 1); 354 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); 355 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); 356 357 // Dropped Pkts and Bytes 358 percent_pkts = (qstats.pkts_dropped * 100.0) / 359 (qstats.pkts_total + 1); 360 percent_bytes = (qstats.bytes_dropped * 100.0) / 361 (qstats.bytes_total + 1); 362 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); 363 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); 364 365 // ECN CE markings 366 percent_pkts = (qstats.pkts_ecn_ce * 100.0) / 367 (qstats.pkts_total + 1); 368 fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, 369 (int)qstats.pkts_ecn_ce); 370 371 // Average cwnd 372 fprintf(fout, "avg cwnd:%d\n", 373 (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); 374 // Average rtt 375 fprintf(fout, "avg rtt:%d\n", 376 (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); 377 // Average credit 378 if (edt_flag) 379 fprintf(fout, "avg credit_ms:%.03f\n", 380 (qstats.sum_credit / 381 (qstats.pkts_total + 1.0)) / 1000000.0); 382 else 383 fprintf(fout, "avg credit:%d\n", 384 (int)(qstats.sum_credit / 385 (1500 * ((int)qstats.pkts_total ) + 1))); 386 387 // Return values stats 388 for (k = 0; k < RET_VAL_COUNT; k++) { 389 percent_pkts = (qstats.returnValCount[k] * 100.0) / 390 (qstats.pkts_total + 1); 391 fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], 392 percent_pkts, (int)qstats.returnValCount[k]); 393 } 394 fclose(fout); 395 } 396 397 if (debugFlag) 398 read_trace_pipe2(); 399 return rc; 400err: 401 rc = 1; 402 403 if (cg1) 404 close(cg1); 405 cleanup_cgroup_environment(); 406 407 return rc; 408} 409 410static void Usage(void) 411{ 412 printf("This program loads a cgroup skb BPF program to enforce\n" 413 "cgroup output (egress) bandwidth limits.\n\n" 414 "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" 415 " [-s] [-t <secs>] [-w] [-h] [prog]\n" 416 " Where:\n" 417 " -o indicates egress direction (default)\n" 418 " -d print BPF trace debug buffer\n" 419 " --edt use fq's Earliest Departure Time\n" 420 " -l also limit flows using loopback\n" 421 " -n <#> to create cgroup \"/hbm#\" and attach prog\n" 422 " Default is /hbm1\n" 423 " --no_cn disable CN notifications\n" 424 " -r <rate> Rate in Mbps\n" 425 " -s Update HBM stats\n" 426 " -t <time> Exit after specified seconds (default is 0)\n" 427 " -w Work conserving flag. cgroup can increase\n" 428 " bandwidth beyond the rate limit specified\n" 429 " while there is available bandwidth. Current\n" 430 " implementation assumes there is only eth0\n" 431 " but can be extended to support multiple NICs\n" 432 " -h print this info\n" 433 " prog BPF program file name. Name defaults to\n" 434 " hbm_out_kern.o\n"); 435} 436 437int main(int argc, char **argv) 438{ 439 char *prog = "hbm_out_kern.o"; 440 int k; 441 int cg_id = 1; 442 char *optstring = "iodln:r:st:wh"; 443 struct option loptions[] = { 444 {"no_cn", 0, NULL, 1}, 445 {"edt", 0, NULL, 2}, 446 {NULL, 0, NULL, 0} 447 }; 448 449 while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { 450 switch (k) { 451 case 1: 452 no_cn_flag = true; 453 break; 454 case 2: 455 prog = "hbm_edt_kern.o"; 456 edt_flag = true; 457 break; 458 case'o': 459 break; 460 case 'd': 461 debugFlag = true; 462 break; 463 case 'l': 464 loopback_flag = true; 465 break; 466 case 'n': 467 cg_id = atoi(optarg); 468 break; 469 case 'r': 470 minRate = atoi(optarg) * 1.024; 471 rate = minRate; 472 break; 473 case 's': 474 stats_flag = true; 475 break; 476 case 't': 477 dur = atoi(optarg); 478 break; 479 case 'w': 480 work_conserving_flag = true; 481 break; 482 case '?': 483 if (optopt == 'n' || optopt == 'r' || optopt == 't') 484 fprintf(stderr, 485 "Option -%c requires an argument.\n\n", 486 optopt); 487 case 'h': 488 __fallthrough; 489 default: 490 Usage(); 491 return 0; 492 } 493 } 494 495 if (optind < argc) 496 prog = argv[optind]; 497 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); 498 499 return run_bpf_prog(prog, cg_id); 500} 501