1// SPDX-License-Identifier: GPL-2.0 2 3/* net/sched/sch_taprio.c Time Aware Priority Scheduler 4 * 5 * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> 6 * 7 */ 8 9#include <linux/types.h> 10#include <linux/slab.h> 11#include <linux/kernel.h> 12#include <linux/string.h> 13#include <linux/list.h> 14#include <linux/errno.h> 15#include <linux/skbuff.h> 16#include <linux/math64.h> 17#include <linux/module.h> 18#include <linux/spinlock.h> 19#include <linux/rcupdate.h> 20#include <net/netlink.h> 21#include <net/pkt_sched.h> 22#include <net/pkt_cls.h> 23#include <net/sch_generic.h> 24#include <net/sock.h> 25#include <net/tcp.h> 26 27static LIST_HEAD(taprio_list); 28static DEFINE_SPINLOCK(taprio_list_lock); 29 30#define TAPRIO_ALL_GATES_OPEN -1 31 32#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) 33#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) 34#define TAPRIO_FLAGS_INVALID U32_MAX 35 36struct sched_entry { 37 struct list_head list; 38 39 /* The instant that this entry "closes" and the next one 40 * should open, the qdisc will make some effort so that no 41 * packet leaves after this time. 
42 */ 43 ktime_t close_time; 44 ktime_t next_txtime; 45 atomic_t budget; 46 int index; 47 u32 gate_mask; 48 u32 interval; 49 u8 command; 50}; 51 52struct sched_gate_list { 53 struct rcu_head rcu; 54 struct list_head entries; 55 size_t num_entries; 56 ktime_t cycle_close_time; 57 s64 cycle_time; 58 s64 cycle_time_extension; 59 s64 base_time; 60}; 61 62struct taprio_sched { 63 struct Qdisc **qdiscs; 64 struct Qdisc *root; 65 u32 flags; 66 enum tk_offsets tk_offset; 67 int clockid; 68 bool offloaded; 69 atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ 70 * speeds it's sub-nanoseconds per byte 71 */ 72 73 /* Protects the update side of the RCU protected current_entry */ 74 spinlock_t current_entry_lock; 75 struct sched_entry __rcu *current_entry; 76 struct sched_gate_list __rcu *oper_sched; 77 struct sched_gate_list __rcu *admin_sched; 78 struct hrtimer advance_timer; 79 struct list_head taprio_list; 80 struct sk_buff *(*dequeue)(struct Qdisc *sch); 81 struct sk_buff *(*peek)(struct Qdisc *sch); 82 u32 txtime_delay; 83}; 84 85struct __tc_taprio_qopt_offload { 86 refcount_t users; 87 struct tc_taprio_qopt_offload offload; 88}; 89 90static ktime_t sched_base_time(const struct sched_gate_list *sched) 91{ 92 if (!sched) 93 return KTIME_MAX; 94 95 return ns_to_ktime(sched->base_time); 96} 97 98static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) 99{ 100 /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ 101 enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); 102 103 switch (tk_offset) { 104 case TK_OFFS_MAX: 105 return mono; 106 default: 107 return ktime_mono_to_any(mono, tk_offset); 108 } 109} 110 111static ktime_t taprio_get_time(const struct taprio_sched *q) 112{ 113 return taprio_mono_to_any(q, ktime_get()); 114} 115 116static void taprio_free_sched_cb(struct rcu_head *head) 117{ 118 struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); 119 struct sched_entry *entry, *n; 120 121 if 
(!sched) 122 return; 123 124 list_for_each_entry_safe(entry, n, &sched->entries, list) { 125 list_del(&entry->list); 126 kfree(entry); 127 } 128 129 kfree(sched); 130} 131 132static void switch_schedules(struct taprio_sched *q, 133 struct sched_gate_list **admin, 134 struct sched_gate_list **oper) 135{ 136 rcu_assign_pointer(q->oper_sched, *admin); 137 rcu_assign_pointer(q->admin_sched, NULL); 138 139 if (*oper) 140 call_rcu(&(*oper)->rcu, taprio_free_sched_cb); 141 142 *oper = *admin; 143 *admin = NULL; 144} 145 146/* Get how much time has been already elapsed in the current cycle. */ 147static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) 148{ 149 ktime_t time_since_sched_start; 150 s32 time_elapsed; 151 152 time_since_sched_start = ktime_sub(time, sched->base_time); 153 div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); 154 155 return time_elapsed; 156} 157 158static ktime_t get_interval_end_time(struct sched_gate_list *sched, 159 struct sched_gate_list *admin, 160 struct sched_entry *entry, 161 ktime_t intv_start) 162{ 163 s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); 164 ktime_t intv_end, cycle_ext_end, cycle_end; 165 166 cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); 167 intv_end = ktime_add_ns(intv_start, entry->interval); 168 cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); 169 170 if (ktime_before(intv_end, cycle_end)) 171 return intv_end; 172 else if (admin && admin != sched && 173 ktime_after(admin->base_time, cycle_end) && 174 ktime_before(admin->base_time, cycle_ext_end)) 175 return admin->base_time; 176 else 177 return cycle_end; 178} 179 180static int length_to_duration(struct taprio_sched *q, int len) 181{ 182 return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); 183} 184 185/* Returns the entry corresponding to next available interval. 
If 186 * validate_interval is set, it only validates whether the timestamp occurs 187 * when the gate corresponding to the skb's traffic class is open. 188 */ 189static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, 190 struct Qdisc *sch, 191 struct sched_gate_list *sched, 192 struct sched_gate_list *admin, 193 ktime_t time, 194 ktime_t *interval_start, 195 ktime_t *interval_end, 196 bool validate_interval) 197{ 198 ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; 199 ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; 200 struct sched_entry *entry = NULL, *entry_found = NULL; 201 struct taprio_sched *q = qdisc_priv(sch); 202 struct net_device *dev = qdisc_dev(sch); 203 bool entry_available = false; 204 s32 cycle_elapsed; 205 int tc, n; 206 207 tc = netdev_get_prio_tc_map(dev, skb->priority); 208 packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); 209 210 *interval_start = 0; 211 *interval_end = 0; 212 213 if (!sched) 214 return NULL; 215 216 cycle = sched->cycle_time; 217 cycle_elapsed = get_cycle_time_elapsed(sched, time); 218 curr_intv_end = ktime_sub_ns(time, cycle_elapsed); 219 cycle_end = ktime_add_ns(curr_intv_end, cycle); 220 221 list_for_each_entry(entry, &sched->entries, list) { 222 curr_intv_start = curr_intv_end; 223 curr_intv_end = get_interval_end_time(sched, admin, entry, 224 curr_intv_start); 225 226 if (ktime_after(curr_intv_start, cycle_end)) 227 break; 228 229 if (!(entry->gate_mask & BIT(tc)) || 230 packet_transmit_time > entry->interval) 231 continue; 232 233 txtime = entry->next_txtime; 234 235 if (ktime_before(txtime, time) || validate_interval) { 236 transmit_end_time = ktime_add_ns(time, packet_transmit_time); 237 if ((ktime_before(curr_intv_start, time) && 238 ktime_before(transmit_end_time, curr_intv_end)) || 239 (ktime_after(curr_intv_start, time) && !validate_interval)) { 240 entry_found = entry; 241 *interval_start = curr_intv_start; 242 *interval_end = 
curr_intv_end; 243 break; 244 } else if (!entry_available && !validate_interval) { 245 /* Here, we are just trying to find out the 246 * first available interval in the next cycle. 247 */ 248 entry_available = 1; 249 entry_found = entry; 250 *interval_start = ktime_add_ns(curr_intv_start, cycle); 251 *interval_end = ktime_add_ns(curr_intv_end, cycle); 252 } 253 } else if (ktime_before(txtime, earliest_txtime) && 254 !entry_available) { 255 earliest_txtime = txtime; 256 entry_found = entry; 257 n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); 258 *interval_start = ktime_add(curr_intv_start, n * cycle); 259 *interval_end = ktime_add(curr_intv_end, n * cycle); 260 } 261 } 262 263 return entry_found; 264} 265 266static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) 267{ 268 struct taprio_sched *q = qdisc_priv(sch); 269 struct sched_gate_list *sched, *admin; 270 ktime_t interval_start, interval_end; 271 struct sched_entry *entry; 272 273 rcu_read_lock(); 274 sched = rcu_dereference(q->oper_sched); 275 admin = rcu_dereference(q->admin_sched); 276 277 entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, 278 &interval_start, &interval_end, true); 279 rcu_read_unlock(); 280 281 return entry; 282} 283 284static bool taprio_flags_valid(u32 flags) 285{ 286 /* Make sure no other flag bits are set. */ 287 if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | 288 TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) 289 return false; 290 /* txtime-assist and full offload are mutually exclusive */ 291 if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && 292 (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) 293 return false; 294 return true; 295} 296 297/* This returns the tstamp value set by TCP in terms of the set clock. 
*/
static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
{
	unsigned int offset = skb_network_offset(skb);
	const struct ipv6hdr *ipv6h;
	const struct iphdr *iph;
	struct ipv6hdr _ipv6h;
	u8 l4proto;

	/* An IPv6-header-sized window is fetched regardless of the actual
	 * IP version; the version nibble is read from it.
	 */
	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
	if (!ipv6h)
		return 0;

	switch (ipv6h->version) {
	case 4:
		iph = (struct iphdr *)ipv6h;
		offset += iph->ihl * 4;
		l4proto = iph->protocol;

		if (l4proto == IPPROTO_TCP)
			break;

		/* special-case 6in4 tunnelling, as that is a common way to get
		 * v6 connectivity in the home
		 */
		if (l4proto != IPPROTO_IPV6)
			return 0;

		ipv6h = skb_header_pointer(skb, offset,
					   sizeof(_ipv6h), &_ipv6h);
		if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
			return 0;
		break;
	case 6:
		if (ipv6h->nexthdr != IPPROTO_TCP)
			return 0;
		break;
	default:
		/* Any other version value falls through to the conversion. */
		break;
	}

	return taprio_mono_to_any(q, skb->skb_mstamp_ns);
}

/* There are a few scenarios where we will have to modify the txtime from
 * what is read from next_txtime in sched_entry. They are:
 * 1. If txtime is in the past,
 *    a. The gate for the traffic class is currently open and packet can be
 *       transmitted before it closes, schedule the packet right away.
 *    b. If the gate corresponding to the traffic class is going to open later
 *       in the cycle, set the txtime of packet to the interval start.
 * 2. If txtime is in the future, there are packets corresponding to the
 *    current traffic class waiting to be transmitted. So, the following
 *    possibilities exist:
 *    a. We can transmit the packet before the window containing the txtime
 *       closes.
 *    b. The window might close before the transmission can be completed
 *       successfully. So, schedule the packet in the next open window.
346 */ 347static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) 348{ 349 ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; 350 struct taprio_sched *q = qdisc_priv(sch); 351 struct sched_gate_list *sched, *admin; 352 ktime_t minimum_time, now, txtime; 353 int len, packet_transmit_time; 354 struct sched_entry *entry; 355 bool sched_changed; 356 357 now = taprio_get_time(q); 358 minimum_time = ktime_add_ns(now, q->txtime_delay); 359 360 tcp_tstamp = get_tcp_tstamp(q, skb); 361 minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); 362 363 rcu_read_lock(); 364 admin = rcu_dereference(q->admin_sched); 365 sched = rcu_dereference(q->oper_sched); 366 if (admin && ktime_after(minimum_time, admin->base_time)) 367 switch_schedules(q, &admin, &sched); 368 369 /* Until the schedule starts, all the queues are open */ 370 if (!sched || ktime_before(minimum_time, sched->base_time)) { 371 txtime = minimum_time; 372 goto done; 373 } 374 375 len = qdisc_pkt_len(skb); 376 packet_transmit_time = length_to_duration(q, len); 377 378 do { 379 sched_changed = 0; 380 381 entry = find_entry_to_transmit(skb, sch, sched, admin, 382 minimum_time, 383 &interval_start, &interval_end, 384 false); 385 if (!entry) { 386 txtime = 0; 387 goto done; 388 } 389 390 txtime = entry->next_txtime; 391 txtime = max_t(ktime_t, txtime, minimum_time); 392 txtime = max_t(ktime_t, txtime, interval_start); 393 394 if (admin && admin != sched && 395 ktime_after(txtime, admin->base_time)) { 396 sched = admin; 397 sched_changed = 1; 398 continue; 399 } 400 401 transmit_end_time = ktime_add(txtime, packet_transmit_time); 402 minimum_time = transmit_end_time; 403 404 /* Update the txtime of current entry to the next time it's 405 * interval starts. 
406 */ 407 if (ktime_after(transmit_end_time, interval_end)) 408 entry->next_txtime = ktime_add(interval_start, sched->cycle_time); 409 } while (sched_changed || ktime_after(transmit_end_time, interval_end)); 410 411 entry->next_txtime = transmit_end_time; 412 413done: 414 rcu_read_unlock(); 415 return txtime; 416} 417 418static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, 419 struct sk_buff **to_free) 420{ 421 struct taprio_sched *q = qdisc_priv(sch); 422 struct Qdisc *child; 423 int queue; 424 425 queue = skb_get_queue_mapping(skb); 426 427 child = q->qdiscs[queue]; 428 if (unlikely(!child)) 429 return qdisc_drop(skb, sch, to_free); 430 431 /* sk_flags are only safe to use on full sockets. */ 432 if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { 433 if (!is_valid_interval(skb, sch)) 434 return qdisc_drop(skb, sch, to_free); 435 } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { 436 skb->tstamp = get_packet_txtime(skb, sch); 437 if (!skb->tstamp) 438 return qdisc_drop(skb, sch, to_free); 439 } 440 441 qdisc_qstats_backlog_inc(sch, skb); 442 sch->q.qlen++; 443 444 return qdisc_enqueue(skb, child, to_free); 445} 446 447static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) 448{ 449 struct taprio_sched *q = qdisc_priv(sch); 450 struct net_device *dev = qdisc_dev(sch); 451 struct sched_entry *entry; 452 struct sk_buff *skb; 453 u32 gate_mask; 454 int i; 455 456 rcu_read_lock(); 457 entry = rcu_dereference(q->current_entry); 458 gate_mask = entry ? 
entry->gate_mask : TAPRIO_ALL_GATES_OPEN; 459 rcu_read_unlock(); 460 461 if (!gate_mask) 462 return NULL; 463 464 for (i = 0; i < dev->num_tx_queues; i++) { 465 struct Qdisc *child = q->qdiscs[i]; 466 int prio; 467 u8 tc; 468 469 if (unlikely(!child)) 470 continue; 471 472 skb = child->ops->peek(child); 473 if (!skb) 474 continue; 475 476 if (TXTIME_ASSIST_IS_ENABLED(q->flags)) 477 return skb; 478 479 prio = skb->priority; 480 tc = netdev_get_prio_tc_map(dev, prio); 481 482 if (!(gate_mask & BIT(tc))) 483 continue; 484 485 return skb; 486 } 487 488 return NULL; 489} 490 491static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) 492{ 493 struct taprio_sched *q = qdisc_priv(sch); 494 struct net_device *dev = qdisc_dev(sch); 495 struct sk_buff *skb; 496 int i; 497 498 for (i = 0; i < dev->num_tx_queues; i++) { 499 struct Qdisc *child = q->qdiscs[i]; 500 501 if (unlikely(!child)) 502 continue; 503 504 skb = child->ops->peek(child); 505 if (!skb) 506 continue; 507 508 return skb; 509 } 510 511 return NULL; 512} 513 514static struct sk_buff *taprio_peek(struct Qdisc *sch) 515{ 516 struct taprio_sched *q = qdisc_priv(sch); 517 518 return q->peek(sch); 519} 520 521static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) 522{ 523 atomic_set(&entry->budget, 524 div64_u64((u64)entry->interval * 1000, 525 atomic64_read(&q->picos_per_byte))); 526} 527 528static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) 529{ 530 struct taprio_sched *q = qdisc_priv(sch); 531 struct net_device *dev = qdisc_dev(sch); 532 struct sk_buff *skb = NULL; 533 struct sched_entry *entry; 534 u32 gate_mask; 535 int i; 536 537 rcu_read_lock(); 538 entry = rcu_dereference(q->current_entry); 539 /* if there's no entry, it means that the schedule didn't 540 * start yet, so force all gates to be open, this is in 541 * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 542 * "AdminGateSates" 543 */ 544 gate_mask = entry ? 
entry->gate_mask : TAPRIO_ALL_GATES_OPEN; 545 546 if (!gate_mask) 547 goto done; 548 549 for (i = 0; i < dev->num_tx_queues; i++) { 550 struct Qdisc *child = q->qdiscs[i]; 551 ktime_t guard; 552 int prio; 553 int len; 554 u8 tc; 555 556 if (unlikely(!child)) 557 continue; 558 559 if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { 560 skb = child->ops->dequeue(child); 561 if (!skb) 562 continue; 563 goto skb_found; 564 } 565 566 skb = child->ops->peek(child); 567 if (!skb) 568 continue; 569 570 prio = skb->priority; 571 tc = netdev_get_prio_tc_map(dev, prio); 572 573 if (!(gate_mask & BIT(tc))) { 574 skb = NULL; 575 continue; 576 } 577 578 len = qdisc_pkt_len(skb); 579 guard = ktime_add_ns(taprio_get_time(q), 580 length_to_duration(q, len)); 581 582 /* In the case that there's no gate entry, there's no 583 * guard band ... 584 */ 585 if (gate_mask != TAPRIO_ALL_GATES_OPEN && 586 ktime_after(guard, entry->close_time)) { 587 skb = NULL; 588 continue; 589 } 590 591 /* ... and no budget. */ 592 if (gate_mask != TAPRIO_ALL_GATES_OPEN && 593 atomic_sub_return(len, &entry->budget) < 0) { 594 skb = NULL; 595 continue; 596 } 597 598 skb = child->ops->dequeue(child); 599 if (unlikely(!skb)) 600 goto done; 601 602skb_found: 603 qdisc_bstats_update(sch, skb); 604 qdisc_qstats_backlog_dec(sch, skb); 605 sch->q.qlen--; 606 607 goto done; 608 } 609 610done: 611 rcu_read_unlock(); 612 613 return skb; 614} 615 616static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) 617{ 618 struct taprio_sched *q = qdisc_priv(sch); 619 struct net_device *dev = qdisc_dev(sch); 620 struct sk_buff *skb; 621 int i; 622 623 for (i = 0; i < dev->num_tx_queues; i++) { 624 struct Qdisc *child = q->qdiscs[i]; 625 626 if (unlikely(!child)) 627 continue; 628 629 skb = child->ops->dequeue(child); 630 if (unlikely(!skb)) 631 continue; 632 633 qdisc_bstats_update(sch, skb); 634 qdisc_qstats_backlog_dec(sch, skb); 635 sch->q.qlen--; 636 637 return skb; 638 } 639 640 return NULL; 641} 642 643static struct 
sk_buff *taprio_dequeue(struct Qdisc *sch) 644{ 645 struct taprio_sched *q = qdisc_priv(sch); 646 647 return q->dequeue(sch); 648} 649 650static bool should_restart_cycle(const struct sched_gate_list *oper, 651 const struct sched_entry *entry) 652{ 653 if (list_is_last(&entry->list, &oper->entries)) 654 return true; 655 656 if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0) 657 return true; 658 659 return false; 660} 661 662static bool should_change_schedules(const struct sched_gate_list *admin, 663 const struct sched_gate_list *oper, 664 ktime_t close_time) 665{ 666 ktime_t next_base_time, extension_time; 667 668 if (!admin) 669 return false; 670 671 next_base_time = sched_base_time(admin); 672 673 /* This is the simple case, the close_time would fall after 674 * the next schedule base_time. 675 */ 676 if (ktime_compare(next_base_time, close_time) <= 0) 677 return true; 678 679 /* This is the cycle_time_extension case, if the close_time 680 * plus the amount that can be extended would fall after the 681 * next schedule base_time, we can extend the current schedule 682 * for that amount. 683 */ 684 extension_time = ktime_add_ns(close_time, oper->cycle_time_extension); 685 686 /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about 687 * how precisely the extension should be made. So after 688 * conformance testing, this logic may change. 
689 */ 690 if (ktime_compare(next_base_time, extension_time) <= 0) 691 return true; 692 693 return false; 694} 695 696static enum hrtimer_restart advance_sched(struct hrtimer *timer) 697{ 698 struct taprio_sched *q = container_of(timer, struct taprio_sched, 699 advance_timer); 700 struct sched_gate_list *oper, *admin; 701 struct sched_entry *entry, *next; 702 struct Qdisc *sch = q->root; 703 ktime_t close_time; 704 705 spin_lock(&q->current_entry_lock); 706 entry = rcu_dereference_protected(q->current_entry, 707 lockdep_is_held(&q->current_entry_lock)); 708 oper = rcu_dereference_protected(q->oper_sched, 709 lockdep_is_held(&q->current_entry_lock)); 710 admin = rcu_dereference_protected(q->admin_sched, 711 lockdep_is_held(&q->current_entry_lock)); 712 713 if (!oper) 714 switch_schedules(q, &admin, &oper); 715 716 /* This can happen in two cases: 1. this is the very first run 717 * of this function (i.e. we weren't running any schedule 718 * previously); 2. The previous schedule just ended. The first 719 * entry of all schedules are pre-calculated during the 720 * schedule initialization. 721 */ 722 if (unlikely(!entry || entry->close_time == oper->base_time)) { 723 next = list_first_entry(&oper->entries, struct sched_entry, 724 list); 725 close_time = next->close_time; 726 goto first_run; 727 } 728 729 if (should_restart_cycle(oper, entry)) { 730 next = list_first_entry(&oper->entries, struct sched_entry, 731 list); 732 oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time, 733 oper->cycle_time); 734 } else { 735 next = list_next_entry(entry, list); 736 } 737 738 close_time = ktime_add_ns(entry->close_time, next->interval); 739 close_time = min_t(ktime_t, close_time, oper->cycle_close_time); 740 741 if (should_change_schedules(admin, oper, close_time)) { 742 /* Set things so the next time this runs, the new 743 * schedule runs. 
744 */ 745 close_time = sched_base_time(admin); 746 switch_schedules(q, &admin, &oper); 747 } 748 749 next->close_time = close_time; 750 taprio_set_budget(q, next); 751 752first_run: 753 rcu_assign_pointer(q->current_entry, next); 754 spin_unlock(&q->current_entry_lock); 755 756 hrtimer_set_expires(&q->advance_timer, close_time); 757 758 rcu_read_lock(); 759 __netif_schedule(sch); 760 rcu_read_unlock(); 761 762 return HRTIMER_RESTART; 763} 764 765static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { 766 [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, 767 [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, 768 [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, 769 [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, 770}; 771 772static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { 773 [TCA_TAPRIO_ATTR_PRIOMAP] = { 774 .len = sizeof(struct tc_mqprio_qopt) 775 }, 776 [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, 777 [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, 778 [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, 779 [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, 780 [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, 781 [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, 782 [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, 783 [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, 784}; 785 786static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, 787 struct sched_entry *entry, 788 struct netlink_ext_ack *extack) 789{ 790 int min_duration = length_to_duration(q, ETH_ZLEN); 791 u32 interval = 0; 792 793 if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) 794 entry->command = nla_get_u8( 795 tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); 796 797 if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) 798 entry->gate_mask = nla_get_u32( 799 tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); 800 801 if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) 802 interval = nla_get_u32( 803 
tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); 804 805 /* The interval should allow at least the minimum ethernet 806 * frame to go out. 807 */ 808 if (interval < min_duration) { 809 NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); 810 return -EINVAL; 811 } 812 813 entry->interval = interval; 814 815 return 0; 816} 817 818static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, 819 struct sched_entry *entry, int index, 820 struct netlink_ext_ack *extack) 821{ 822 struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; 823 int err; 824 825 err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, 826 entry_policy, NULL); 827 if (err < 0) { 828 NL_SET_ERR_MSG(extack, "Could not parse nested entry"); 829 return -EINVAL; 830 } 831 832 entry->index = index; 833 834 return fill_sched_entry(q, tb, entry, extack); 835} 836 837static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, 838 struct sched_gate_list *sched, 839 struct netlink_ext_ack *extack) 840{ 841 struct nlattr *n; 842 int err, rem; 843 int i = 0; 844 845 if (!list) 846 return -EINVAL; 847 848 nla_for_each_nested(n, list, rem) { 849 struct sched_entry *entry; 850 851 if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { 852 NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); 853 continue; 854 } 855 856 entry = kzalloc(sizeof(*entry), GFP_KERNEL); 857 if (!entry) { 858 NL_SET_ERR_MSG(extack, "Not enough memory for entry"); 859 return -ENOMEM; 860 } 861 862 err = parse_sched_entry(q, n, entry, i, extack); 863 if (err < 0) { 864 kfree(entry); 865 return err; 866 } 867 868 list_add_tail(&entry->list, &sched->entries); 869 i++; 870 } 871 872 sched->num_entries = i; 873 874 return i; 875} 876 877static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, 878 struct sched_gate_list *new, 879 struct netlink_ext_ack *extack) 880{ 881 int err = 0; 882 883 if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { 884 NL_SET_ERR_MSG(extack, "Adding a single entry is not 
supported"); 885 return -ENOTSUPP; 886 } 887 888 if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) 889 new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); 890 891 if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) 892 new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); 893 894 if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) 895 new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); 896 897 if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) 898 err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], 899 new, extack); 900 if (err < 0) 901 return err; 902 903 if (!new->cycle_time) { 904 struct sched_entry *entry; 905 ktime_t cycle = 0; 906 907 list_for_each_entry(entry, &new->entries, list) 908 cycle = ktime_add_ns(cycle, entry->interval); 909 910 if (!cycle) { 911 NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); 912 return -EINVAL; 913 } 914 915 new->cycle_time = cycle; 916 } 917 918 return 0; 919} 920 921static int taprio_parse_mqprio_opt(struct net_device *dev, 922 struct tc_mqprio_qopt *qopt, 923 struct netlink_ext_ack *extack, 924 u32 taprio_flags) 925{ 926 int i, j; 927 928 if (!qopt && !dev->num_tc) { 929 NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); 930 return -EINVAL; 931 } 932 933 /* If num_tc is already set, it means that the user already 934 * configured the mqprio part 935 */ 936 if (dev->num_tc) 937 return 0; 938 939 /* Verify num_tc is not out of max range */ 940 if (qopt->num_tc > TC_MAX_QUEUE) { 941 NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); 942 return -EINVAL; 943 } 944 945 /* taprio imposes that traffic classes map 1:n to tx queues */ 946 if (qopt->num_tc > dev->num_tx_queues) { 947 NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); 948 return -EINVAL; 949 } 950 951 /* Verify priority mapping uses valid tcs */ 952 for (i = 0; i <= TC_BITMASK; i++) { 953 if (qopt->prio_tc_map[i] >= qopt->num_tc) { 954 
NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); 955 return -EINVAL; 956 } 957 } 958 959 for (i = 0; i < qopt->num_tc; i++) { 960 unsigned int last = qopt->offset[i] + qopt->count[i]; 961 962 /* Verify the queue count is in tx range being equal to the 963 * real_num_tx_queues indicates the last queue is in use. 964 */ 965 if (qopt->offset[i] >= dev->num_tx_queues || 966 !qopt->count[i] || 967 last > dev->real_num_tx_queues) { 968 NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping"); 969 return -EINVAL; 970 } 971 972 if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) 973 continue; 974 975 /* Verify that the offset and counts do not overlap */ 976 for (j = i + 1; j < qopt->num_tc; j++) { 977 if (last > qopt->offset[j]) { 978 NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping"); 979 return -EINVAL; 980 } 981 } 982 } 983 984 return 0; 985} 986 987static int taprio_get_start_time(struct Qdisc *sch, 988 struct sched_gate_list *sched, 989 ktime_t *start) 990{ 991 struct taprio_sched *q = qdisc_priv(sch); 992 ktime_t now, base, cycle; 993 s64 n; 994 995 base = sched_base_time(sched); 996 now = taprio_get_time(q); 997 998 if (ktime_after(base, now)) { 999 *start = base; 1000 return 0; 1001 } 1002 1003 cycle = sched->cycle_time; 1004 1005 /* The qdisc is expected to have at least one sched_entry. Moreover, 1006 * any entry must have 'interval' > 0. Thus if the cycle time is zero, 1007 * something went really wrong. In that case, we should warn about this 1008 * inconsistent state and return error. 1009 */ 1010 if (WARN_ON(!cycle)) 1011 return -EFAULT; 1012 1013 /* Schedule the start time for the beginning of the next 1014 * cycle. 
1015 */ 1016 n = div64_s64(ktime_sub_ns(now, base), cycle); 1017 *start = ktime_add_ns(base, (n + 1) * cycle); 1018 return 0; 1019} 1020 1021static void setup_first_close_time(struct taprio_sched *q, 1022 struct sched_gate_list *sched, ktime_t base) 1023{ 1024 struct sched_entry *first; 1025 ktime_t cycle; 1026 1027 first = list_first_entry(&sched->entries, 1028 struct sched_entry, list); 1029 1030 cycle = sched->cycle_time; 1031 1032 /* FIXME: find a better place to do this */ 1033 sched->cycle_close_time = ktime_add_ns(base, cycle); 1034 1035 first->close_time = ktime_add_ns(base, first->interval); 1036 taprio_set_budget(q, first); 1037 rcu_assign_pointer(q->current_entry, NULL); 1038} 1039 1040static void taprio_start_sched(struct Qdisc *sch, 1041 ktime_t start, struct sched_gate_list *new) 1042{ 1043 struct taprio_sched *q = qdisc_priv(sch); 1044 ktime_t expires; 1045 1046 if (FULL_OFFLOAD_IS_ENABLED(q->flags)) 1047 return; 1048 1049 expires = hrtimer_get_expires(&q->advance_timer); 1050 if (expires == 0) 1051 expires = KTIME_MAX; 1052 1053 /* If the new schedule starts before the next expiration, we 1054 * reprogram it to the earliest one, so we change the admin 1055 * schedule to the operational one at the right time. 
1056 */ 1057 start = min_t(ktime_t, start, expires); 1058 1059 hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); 1060} 1061 1062static void taprio_set_picos_per_byte(struct net_device *dev, 1063 struct taprio_sched *q) 1064{ 1065 struct ethtool_link_ksettings ecmd; 1066 int speed = SPEED_10; 1067 int picos_per_byte; 1068 int err; 1069 1070 err = __ethtool_get_link_ksettings(dev, &ecmd); 1071 if (err < 0) 1072 goto skip; 1073 1074 if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) 1075 speed = ecmd.base.speed; 1076 1077skip: 1078 picos_per_byte = (USEC_PER_SEC * 8) / speed; 1079 1080 atomic64_set(&q->picos_per_byte, picos_per_byte); 1081 netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", 1082 dev->name, (long long)atomic64_read(&q->picos_per_byte), 1083 ecmd.base.speed); 1084} 1085 1086static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, 1087 void *ptr) 1088{ 1089 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1090 struct net_device *qdev; 1091 struct taprio_sched *q; 1092 bool found = false; 1093 1094 ASSERT_RTNL(); 1095 1096 if (event != NETDEV_UP && event != NETDEV_CHANGE) 1097 return NOTIFY_DONE; 1098 1099 spin_lock(&taprio_list_lock); 1100 list_for_each_entry(q, &taprio_list, taprio_list) { 1101 qdev = qdisc_dev(q->root); 1102 if (qdev == dev) { 1103 found = true; 1104 break; 1105 } 1106 } 1107 spin_unlock(&taprio_list_lock); 1108 1109 if (found) 1110 taprio_set_picos_per_byte(dev, q); 1111 1112 return NOTIFY_DONE; 1113} 1114 1115static void setup_txtime(struct taprio_sched *q, 1116 struct sched_gate_list *sched, ktime_t base) 1117{ 1118 struct sched_entry *entry; 1119 u32 interval = 0; 1120 1121 list_for_each_entry(entry, &sched->entries, list) { 1122 entry->next_txtime = ktime_add_ns(base, interval); 1123 interval += entry->interval; 1124 } 1125} 1126 1127static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) 1128{ 1129 struct __tc_taprio_qopt_offload 
*__offload; 1130 1131 __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), 1132 GFP_KERNEL); 1133 if (!__offload) 1134 return NULL; 1135 1136 refcount_set(&__offload->users, 1); 1137 1138 return &__offload->offload; 1139} 1140 1141struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload 1142 *offload) 1143{ 1144 struct __tc_taprio_qopt_offload *__offload; 1145 1146 __offload = container_of(offload, struct __tc_taprio_qopt_offload, 1147 offload); 1148 1149 refcount_inc(&__offload->users); 1150 1151 return offload; 1152} 1153EXPORT_SYMBOL_GPL(taprio_offload_get); 1154 1155void taprio_offload_free(struct tc_taprio_qopt_offload *offload) 1156{ 1157 struct __tc_taprio_qopt_offload *__offload; 1158 1159 __offload = container_of(offload, struct __tc_taprio_qopt_offload, 1160 offload); 1161 1162 if (!refcount_dec_and_test(&__offload->users)) 1163 return; 1164 1165 kfree(__offload); 1166} 1167EXPORT_SYMBOL_GPL(taprio_offload_free); 1168 1169/* The function will only serve to keep the pointers to the "oper" and "admin" 1170 * schedules valid in relation to their base times, so when calling dump() the 1171 * users looks at the right schedules. 1172 * When using full offload, the admin configuration is promoted to oper at the 1173 * base_time in the PHC time domain. But because the system time is not 1174 * necessarily in sync with that, we can't just trigger a hrtimer to call 1175 * switch_schedules at the right hardware time. 1176 * At the moment we call this by hand right away from taprio, but in the future 1177 * it will be useful to create a mechanism for drivers to notify taprio of the 1178 * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). 1179 * This is left as TODO. 
1180 */ 1181static void taprio_offload_config_changed(struct taprio_sched *q) 1182{ 1183 struct sched_gate_list *oper, *admin; 1184 1185 spin_lock(&q->current_entry_lock); 1186 1187 oper = rcu_dereference_protected(q->oper_sched, 1188 lockdep_is_held(&q->current_entry_lock)); 1189 admin = rcu_dereference_protected(q->admin_sched, 1190 lockdep_is_held(&q->current_entry_lock)); 1191 1192 switch_schedules(q, &admin, &oper); 1193 1194 spin_unlock(&q->current_entry_lock); 1195} 1196 1197static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) 1198{ 1199 u32 i, queue_mask = 0; 1200 1201 for (i = 0; i < dev->num_tc; i++) { 1202 u32 offset, count; 1203 1204 if (!(tc_mask & BIT(i))) 1205 continue; 1206 1207 offset = dev->tc_to_txq[i].offset; 1208 count = dev->tc_to_txq[i].count; 1209 1210 queue_mask |= GENMASK(offset + count - 1, offset); 1211 } 1212 1213 return queue_mask; 1214} 1215 1216static void taprio_sched_to_offload(struct net_device *dev, 1217 struct sched_gate_list *sched, 1218 struct tc_taprio_qopt_offload *offload) 1219{ 1220 struct sched_entry *entry; 1221 int i = 0; 1222 1223 offload->base_time = sched->base_time; 1224 offload->cycle_time = sched->cycle_time; 1225 offload->cycle_time_extension = sched->cycle_time_extension; 1226 1227 list_for_each_entry(entry, &sched->entries, list) { 1228 struct tc_taprio_sched_entry *e = &offload->entries[i]; 1229 1230 e->command = entry->command; 1231 e->interval = entry->interval; 1232 e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); 1233 1234 i++; 1235 } 1236 1237 offload->num_entries = i; 1238} 1239 1240static int taprio_enable_offload(struct net_device *dev, 1241 struct taprio_sched *q, 1242 struct sched_gate_list *sched, 1243 struct netlink_ext_ack *extack) 1244{ 1245 const struct net_device_ops *ops = dev->netdev_ops; 1246 struct tc_taprio_qopt_offload *offload; 1247 int err = 0; 1248 1249 if (!ops->ndo_setup_tc) { 1250 NL_SET_ERR_MSG(extack, 1251 "Device does not support taprio offload"); 1252 
return -EOPNOTSUPP; 1253 } 1254 1255 offload = taprio_offload_alloc(sched->num_entries); 1256 if (!offload) { 1257 NL_SET_ERR_MSG(extack, 1258 "Not enough memory for enabling offload mode"); 1259 return -ENOMEM; 1260 } 1261 offload->enable = 1; 1262 taprio_sched_to_offload(dev, sched, offload); 1263 1264 err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); 1265 if (err < 0) { 1266 NL_SET_ERR_MSG(extack, 1267 "Device failed to setup taprio offload"); 1268 goto done; 1269 } 1270 1271 q->offloaded = true; 1272 1273done: 1274 taprio_offload_free(offload); 1275 1276 return err; 1277} 1278 1279static int taprio_disable_offload(struct net_device *dev, 1280 struct taprio_sched *q, 1281 struct netlink_ext_ack *extack) 1282{ 1283 const struct net_device_ops *ops = dev->netdev_ops; 1284 struct tc_taprio_qopt_offload *offload; 1285 int err; 1286 1287 if (!q->offloaded) 1288 return 0; 1289 1290 offload = taprio_offload_alloc(0); 1291 if (!offload) { 1292 NL_SET_ERR_MSG(extack, 1293 "Not enough memory to disable offload mode"); 1294 return -ENOMEM; 1295 } 1296 offload->enable = 0; 1297 1298 err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); 1299 if (err < 0) { 1300 NL_SET_ERR_MSG(extack, 1301 "Device failed to disable offload"); 1302 goto out; 1303 } 1304 1305 q->offloaded = false; 1306 1307out: 1308 taprio_offload_free(offload); 1309 1310 return err; 1311} 1312 1313/* If full offload is enabled, the only possible clockid is the net device's 1314 * PHC. For that reason, specifying a clockid through netlink is incorrect. 1315 * For txtime-assist, it is implicitly assumed that the device's PHC is kept 1316 * in sync with the specified clockid via a user space daemon such as phc2sys. 1317 * For both software taprio and txtime-assist, the clockid is used for the 1318 * hrtimer that advances the schedule and hence mandatory. 
1319 */ 1320static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, 1321 struct netlink_ext_ack *extack) 1322{ 1323 struct taprio_sched *q = qdisc_priv(sch); 1324 struct net_device *dev = qdisc_dev(sch); 1325 int err = -EINVAL; 1326 1327 if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { 1328 const struct ethtool_ops *ops = dev->ethtool_ops; 1329 struct ethtool_ts_info info = { 1330 .cmd = ETHTOOL_GET_TS_INFO, 1331 .phc_index = -1, 1332 }; 1333 1334 if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { 1335 NL_SET_ERR_MSG(extack, 1336 "The 'clockid' cannot be specified for full offload"); 1337 goto out; 1338 } 1339 1340 if (ops && ops->get_ts_info) 1341 err = ops->get_ts_info(dev, &info); 1342 1343 if (err || info.phc_index < 0) { 1344 NL_SET_ERR_MSG(extack, 1345 "Device does not have a PTP clock"); 1346 err = -ENOTSUPP; 1347 goto out; 1348 } 1349 } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { 1350 int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); 1351 enum tk_offsets tk_offset; 1352 1353 /* We only support static clockids and we don't allow 1354 * for it to be modified after the first init. 
1355 */ 1356 if (clockid < 0 || 1357 (q->clockid != -1 && q->clockid != clockid)) { 1358 NL_SET_ERR_MSG(extack, 1359 "Changing the 'clockid' of a running schedule is not supported"); 1360 err = -ENOTSUPP; 1361 goto out; 1362 } 1363 1364 switch (clockid) { 1365 case CLOCK_REALTIME: 1366 tk_offset = TK_OFFS_REAL; 1367 break; 1368 case CLOCK_MONOTONIC: 1369 tk_offset = TK_OFFS_MAX; 1370 break; 1371 case CLOCK_BOOTTIME: 1372 tk_offset = TK_OFFS_BOOT; 1373 break; 1374 case CLOCK_TAI: 1375 tk_offset = TK_OFFS_TAI; 1376 break; 1377 default: 1378 NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); 1379 err = -EINVAL; 1380 goto out; 1381 } 1382 /* This pairs with READ_ONCE() in taprio_mono_to_any */ 1383 WRITE_ONCE(q->tk_offset, tk_offset); 1384 1385 q->clockid = clockid; 1386 } else { 1387 NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); 1388 goto out; 1389 } 1390 1391 /* Everything went ok, return success. */ 1392 err = 0; 1393 1394out: 1395 return err; 1396} 1397 1398static int taprio_mqprio_cmp(const struct net_device *dev, 1399 const struct tc_mqprio_qopt *mqprio) 1400{ 1401 int i; 1402 1403 if (!mqprio || mqprio->num_tc != dev->num_tc) 1404 return -1; 1405 1406 for (i = 0; i < mqprio->num_tc; i++) 1407 if (dev->tc_to_txq[i].count != mqprio->count[i] || 1408 dev->tc_to_txq[i].offset != mqprio->offset[i]) 1409 return -1; 1410 1411 for (i = 0; i <= TC_BITMASK; i++) 1412 if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) 1413 return -1; 1414 1415 return 0; 1416} 1417 1418/* The semantics of the 'flags' argument in relation to 'change()' 1419 * requests, are interpreted following two rules (which are applied in 1420 * this order): (1) an omitted 'flags' argument is interpreted as 1421 * zero; (2) the 'flags' of a "running" taprio instance cannot be 1422 * changed. 
1423 */ 1424static int taprio_new_flags(const struct nlattr *attr, u32 old, 1425 struct netlink_ext_ack *extack) 1426{ 1427 u32 new = 0; 1428 1429 if (attr) 1430 new = nla_get_u32(attr); 1431 1432 if (old != TAPRIO_FLAGS_INVALID && old != new) { 1433 NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); 1434 return -EOPNOTSUPP; 1435 } 1436 1437 if (!taprio_flags_valid(new)) { 1438 NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); 1439 return -EINVAL; 1440 } 1441 1442 return new; 1443} 1444 1445static int taprio_change(struct Qdisc *sch, struct nlattr *opt, 1446 struct netlink_ext_ack *extack) 1447{ 1448 struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; 1449 struct sched_gate_list *oper, *admin, *new_admin; 1450 struct taprio_sched *q = qdisc_priv(sch); 1451 struct net_device *dev = qdisc_dev(sch); 1452 struct tc_mqprio_qopt *mqprio = NULL; 1453 unsigned long flags; 1454 ktime_t start; 1455 int i, err; 1456 1457 err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, 1458 taprio_policy, extack); 1459 if (err < 0) 1460 return err; 1461 1462 if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) 1463 mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); 1464 1465 err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS], 1466 q->flags, extack); 1467 if (err < 0) 1468 return err; 1469 1470 q->flags = err; 1471 1472 err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); 1473 if (err < 0) 1474 return err; 1475 1476 new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); 1477 if (!new_admin) { 1478 NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); 1479 return -ENOMEM; 1480 } 1481 INIT_LIST_HEAD(&new_admin->entries); 1482 1483 rcu_read_lock(); 1484 oper = rcu_dereference(q->oper_sched); 1485 admin = rcu_dereference(q->admin_sched); 1486 rcu_read_unlock(); 1487 1488 /* no changes - no new mqprio settings */ 1489 if (!taprio_mqprio_cmp(dev, mqprio)) 1490 mqprio = NULL; 1491 1492 if (mqprio && (oper || admin)) { 1493 NL_SET_ERR_MSG(extack, 
"Changing the traffic mapping of a running schedule is not supported"); 1494 err = -ENOTSUPP; 1495 goto free_sched; 1496 } 1497 1498 err = parse_taprio_schedule(q, tb, new_admin, extack); 1499 if (err < 0) 1500 goto free_sched; 1501 1502 if (new_admin->num_entries == 0) { 1503 NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); 1504 err = -EINVAL; 1505 goto free_sched; 1506 } 1507 1508 err = taprio_parse_clockid(sch, tb, extack); 1509 if (err < 0) 1510 goto free_sched; 1511 1512 taprio_set_picos_per_byte(dev, q); 1513 1514 if (mqprio) { 1515 err = netdev_set_num_tc(dev, mqprio->num_tc); 1516 if (err) 1517 goto free_sched; 1518 for (i = 0; i < mqprio->num_tc; i++) 1519 netdev_set_tc_queue(dev, i, 1520 mqprio->count[i], 1521 mqprio->offset[i]); 1522 1523 /* Always use supplied priority mappings */ 1524 for (i = 0; i <= TC_BITMASK; i++) 1525 netdev_set_prio_tc_map(dev, i, 1526 mqprio->prio_tc_map[i]); 1527 } 1528 1529 if (FULL_OFFLOAD_IS_ENABLED(q->flags)) 1530 err = taprio_enable_offload(dev, q, new_admin, extack); 1531 else 1532 err = taprio_disable_offload(dev, q, extack); 1533 if (err) 1534 goto free_sched; 1535 1536 /* Protects against enqueue()/dequeue() */ 1537 spin_lock_bh(qdisc_lock(sch)); 1538 1539 if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { 1540 if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { 1541 NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); 1542 err = -EINVAL; 1543 goto unlock; 1544 } 1545 1546 q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); 1547 } 1548 1549 if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && 1550 !FULL_OFFLOAD_IS_ENABLED(q->flags) && 1551 !hrtimer_active(&q->advance_timer)) { 1552 hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); 1553 q->advance_timer.function = advance_sched; 1554 } 1555 1556 if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { 1557 q->dequeue = taprio_dequeue_offload; 1558 q->peek = taprio_peek_offload; 1559 } else { 1560 /* Be sure to always 
keep the function pointers 1561 * in a consistent state. 1562 */ 1563 q->dequeue = taprio_dequeue_soft; 1564 q->peek = taprio_peek_soft; 1565 } 1566 1567 err = taprio_get_start_time(sch, new_admin, &start); 1568 if (err < 0) { 1569 NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); 1570 goto unlock; 1571 } 1572 1573 setup_txtime(q, new_admin, start); 1574 1575 if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { 1576 if (!oper) { 1577 rcu_assign_pointer(q->oper_sched, new_admin); 1578 err = 0; 1579 new_admin = NULL; 1580 goto unlock; 1581 } 1582 1583 rcu_assign_pointer(q->admin_sched, new_admin); 1584 if (admin) 1585 call_rcu(&admin->rcu, taprio_free_sched_cb); 1586 } else { 1587 setup_first_close_time(q, new_admin, start); 1588 1589 /* Protects against advance_sched() */ 1590 spin_lock_irqsave(&q->current_entry_lock, flags); 1591 1592 taprio_start_sched(sch, start, new_admin); 1593 1594 rcu_assign_pointer(q->admin_sched, new_admin); 1595 if (admin) 1596 call_rcu(&admin->rcu, taprio_free_sched_cb); 1597 1598 spin_unlock_irqrestore(&q->current_entry_lock, flags); 1599 1600 if (FULL_OFFLOAD_IS_ENABLED(q->flags)) 1601 taprio_offload_config_changed(q); 1602 } 1603 1604 new_admin = NULL; 1605 err = 0; 1606 1607unlock: 1608 spin_unlock_bh(qdisc_lock(sch)); 1609 1610free_sched: 1611 if (new_admin) 1612 call_rcu(&new_admin->rcu, taprio_free_sched_cb); 1613 1614 return err; 1615} 1616 1617static void taprio_reset(struct Qdisc *sch) 1618{ 1619 struct taprio_sched *q = qdisc_priv(sch); 1620 struct net_device *dev = qdisc_dev(sch); 1621 int i; 1622 1623 hrtimer_cancel(&q->advance_timer); 1624 1625 if (q->qdiscs) { 1626 for (i = 0; i < dev->num_tx_queues; i++) 1627 if (q->qdiscs[i]) 1628 qdisc_reset(q->qdiscs[i]); 1629 } 1630} 1631 1632static void taprio_destroy(struct Qdisc *sch) 1633{ 1634 struct taprio_sched *q = qdisc_priv(sch); 1635 struct net_device *dev = qdisc_dev(sch); 1636 unsigned int i; 1637 1638 spin_lock(&taprio_list_lock); 1639 list_del(&q->taprio_list); 1640 
spin_unlock(&taprio_list_lock); 1641 1642 /* Note that taprio_reset() might not be called if an error 1643 * happens in qdisc_create(), after taprio_init() has been called. 1644 */ 1645 hrtimer_cancel(&q->advance_timer); 1646 qdisc_synchronize(sch); 1647 1648 taprio_disable_offload(dev, q, NULL); 1649 1650 if (q->qdiscs) { 1651 for (i = 0; i < dev->num_tx_queues; i++) 1652 qdisc_put(q->qdiscs[i]); 1653 1654 kfree(q->qdiscs); 1655 } 1656 q->qdiscs = NULL; 1657 1658 netdev_reset_tc(dev); 1659 1660 if (q->oper_sched) 1661 call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb); 1662 1663 if (q->admin_sched) 1664 call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb); 1665} 1666 1667static int taprio_init(struct Qdisc *sch, struct nlattr *opt, 1668 struct netlink_ext_ack *extack) 1669{ 1670 struct taprio_sched *q = qdisc_priv(sch); 1671 struct net_device *dev = qdisc_dev(sch); 1672 int i; 1673 1674 spin_lock_init(&q->current_entry_lock); 1675 1676 hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); 1677 q->advance_timer.function = advance_sched; 1678 1679 q->dequeue = taprio_dequeue_soft; 1680 q->peek = taprio_peek_soft; 1681 1682 q->root = sch; 1683 1684 /* We only support static clockids. Use an invalid value as default 1685 * and get the valid one on taprio_change(). 
1686 */ 1687 q->clockid = -1; 1688 q->flags = TAPRIO_FLAGS_INVALID; 1689 1690 spin_lock(&taprio_list_lock); 1691 list_add(&q->taprio_list, &taprio_list); 1692 spin_unlock(&taprio_list_lock); 1693 1694 if (sch->parent != TC_H_ROOT) 1695 return -EOPNOTSUPP; 1696 1697 if (!netif_is_multiqueue(dev)) 1698 return -EOPNOTSUPP; 1699 1700 /* pre-allocate qdisc, attachment can't fail */ 1701 q->qdiscs = kcalloc(dev->num_tx_queues, 1702 sizeof(q->qdiscs[0]), 1703 GFP_KERNEL); 1704 1705 if (!q->qdiscs) 1706 return -ENOMEM; 1707 1708 if (!opt) 1709 return -EINVAL; 1710 1711 for (i = 0; i < dev->num_tx_queues; i++) { 1712 struct netdev_queue *dev_queue; 1713 struct Qdisc *qdisc; 1714 1715 dev_queue = netdev_get_tx_queue(dev, i); 1716 qdisc = qdisc_create_dflt(dev_queue, 1717 &pfifo_qdisc_ops, 1718 TC_H_MAKE(TC_H_MAJ(sch->handle), 1719 TC_H_MIN(i + 1)), 1720 extack); 1721 if (!qdisc) 1722 return -ENOMEM; 1723 1724 if (i < dev->real_num_tx_queues) 1725 qdisc_hash_add(qdisc, false); 1726 1727 q->qdiscs[i] = qdisc; 1728 } 1729 1730 return taprio_change(sch, opt, extack); 1731} 1732 1733static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, 1734 unsigned long cl) 1735{ 1736 struct net_device *dev = qdisc_dev(sch); 1737 unsigned long ntx = cl - 1; 1738 1739 if (ntx >= dev->num_tx_queues) 1740 return NULL; 1741 1742 return netdev_get_tx_queue(dev, ntx); 1743} 1744 1745static int taprio_graft(struct Qdisc *sch, unsigned long cl, 1746 struct Qdisc *new, struct Qdisc **old, 1747 struct netlink_ext_ack *extack) 1748{ 1749 struct taprio_sched *q = qdisc_priv(sch); 1750 struct net_device *dev = qdisc_dev(sch); 1751 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); 1752 1753 if (!dev_queue) 1754 return -EINVAL; 1755 1756 if (dev->flags & IFF_UP) 1757 dev_deactivate(dev); 1758 1759 *old = q->qdiscs[cl - 1]; 1760 q->qdiscs[cl - 1] = new; 1761 1762 if (new) 1763 new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; 1764 1765 if (dev->flags & IFF_UP) 1766 dev_activate(dev); 1767 
1768 return 0; 1769} 1770 1771static int dump_entry(struct sk_buff *msg, 1772 const struct sched_entry *entry) 1773{ 1774 struct nlattr *item; 1775 1776 item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); 1777 if (!item) 1778 return -ENOSPC; 1779 1780 if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) 1781 goto nla_put_failure; 1782 1783 if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) 1784 goto nla_put_failure; 1785 1786 if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, 1787 entry->gate_mask)) 1788 goto nla_put_failure; 1789 1790 if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, 1791 entry->interval)) 1792 goto nla_put_failure; 1793 1794 return nla_nest_end(msg, item); 1795 1796nla_put_failure: 1797 nla_nest_cancel(msg, item); 1798 return -1; 1799} 1800 1801static int dump_schedule(struct sk_buff *msg, 1802 const struct sched_gate_list *root) 1803{ 1804 struct nlattr *entry_list; 1805 struct sched_entry *entry; 1806 1807 if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, 1808 root->base_time, TCA_TAPRIO_PAD)) 1809 return -1; 1810 1811 if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, 1812 root->cycle_time, TCA_TAPRIO_PAD)) 1813 return -1; 1814 1815 if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, 1816 root->cycle_time_extension, TCA_TAPRIO_PAD)) 1817 return -1; 1818 1819 entry_list = nla_nest_start_noflag(msg, 1820 TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); 1821 if (!entry_list) 1822 goto error_nest; 1823 1824 list_for_each_entry(entry, &root->entries, list) { 1825 if (dump_entry(msg, entry) < 0) 1826 goto error_nest; 1827 } 1828 1829 nla_nest_end(msg, entry_list); 1830 return 0; 1831 1832error_nest: 1833 nla_nest_cancel(msg, entry_list); 1834 return -1; 1835} 1836 1837static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) 1838{ 1839 struct taprio_sched *q = qdisc_priv(sch); 1840 struct net_device *dev = qdisc_dev(sch); 1841 struct sched_gate_list *oper, *admin; 1842 struct tc_mqprio_qopt opt = 
{ 0 }; 1843 struct nlattr *nest, *sched_nest; 1844 unsigned int i; 1845 1846 rcu_read_lock(); 1847 oper = rcu_dereference(q->oper_sched); 1848 admin = rcu_dereference(q->admin_sched); 1849 1850 opt.num_tc = netdev_get_num_tc(dev); 1851 memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); 1852 1853 for (i = 0; i < netdev_get_num_tc(dev); i++) { 1854 opt.count[i] = dev->tc_to_txq[i].count; 1855 opt.offset[i] = dev->tc_to_txq[i].offset; 1856 } 1857 1858 nest = nla_nest_start_noflag(skb, TCA_OPTIONS); 1859 if (!nest) 1860 goto start_error; 1861 1862 if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) 1863 goto options_error; 1864 1865 if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && 1866 nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) 1867 goto options_error; 1868 1869 if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) 1870 goto options_error; 1871 1872 if (q->txtime_delay && 1873 nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) 1874 goto options_error; 1875 1876 if (oper && dump_schedule(skb, oper)) 1877 goto options_error; 1878 1879 if (!admin) 1880 goto done; 1881 1882 sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); 1883 if (!sched_nest) 1884 goto options_error; 1885 1886 if (dump_schedule(skb, admin)) 1887 goto admin_error; 1888 1889 nla_nest_end(skb, sched_nest); 1890 1891done: 1892 rcu_read_unlock(); 1893 1894 return nla_nest_end(skb, nest); 1895 1896admin_error: 1897 nla_nest_cancel(skb, sched_nest); 1898 1899options_error: 1900 nla_nest_cancel(skb, nest); 1901 1902start_error: 1903 rcu_read_unlock(); 1904 return -ENOSPC; 1905} 1906 1907static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) 1908{ 1909 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); 1910 1911 if (!dev_queue) 1912 return NULL; 1913 1914 return dev_queue->qdisc_sleeping; 1915} 1916 1917static unsigned long taprio_find(struct Qdisc *sch, u32 classid) 1918{ 1919 unsigned int ntx = 
TC_H_MIN(classid); 1920 1921 if (!taprio_queue_get(sch, ntx)) 1922 return 0; 1923 return ntx; 1924} 1925 1926static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, 1927 struct sk_buff *skb, struct tcmsg *tcm) 1928{ 1929 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); 1930 1931 tcm->tcm_parent = TC_H_ROOT; 1932 tcm->tcm_handle |= TC_H_MIN(cl); 1933 tcm->tcm_info = dev_queue->qdisc_sleeping->handle; 1934 1935 return 0; 1936} 1937 1938static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, 1939 struct gnet_dump *d) 1940 __releases(d->lock) 1941 __acquires(d->lock) 1942{ 1943 struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); 1944 1945 sch = dev_queue->qdisc_sleeping; 1946 if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || 1947 qdisc_qstats_copy(d, sch) < 0) 1948 return -1; 1949 return 0; 1950} 1951 1952static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) 1953{ 1954 struct net_device *dev = qdisc_dev(sch); 1955 unsigned long ntx; 1956 1957 if (arg->stop) 1958 return; 1959 1960 arg->count = arg->skip; 1961 for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { 1962 if (arg->fn(sch, ntx + 1, arg) < 0) { 1963 arg->stop = 1; 1964 break; 1965 } 1966 arg->count++; 1967 } 1968} 1969 1970static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, 1971 struct tcmsg *tcm) 1972{ 1973 return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); 1974} 1975 1976static const struct Qdisc_class_ops taprio_class_ops = { 1977 .graft = taprio_graft, 1978 .leaf = taprio_leaf, 1979 .find = taprio_find, 1980 .walk = taprio_walk, 1981 .dump = taprio_dump_class, 1982 .dump_stats = taprio_dump_class_stats, 1983 .select_queue = taprio_select_queue, 1984}; 1985 1986static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { 1987 .cl_ops = &taprio_class_ops, 1988 .id = "taprio", 1989 .priv_size = sizeof(struct taprio_sched), 1990 .init = taprio_init, 1991 .change = taprio_change, 1992 .destroy = 
taprio_destroy, 1993 .reset = taprio_reset, 1994 .peek = taprio_peek, 1995 .dequeue = taprio_dequeue, 1996 .enqueue = taprio_enqueue, 1997 .dump = taprio_dump, 1998 .owner = THIS_MODULE, 1999}; 2000 2001static struct notifier_block taprio_device_notifier = { 2002 .notifier_call = taprio_dev_notifier, 2003}; 2004 2005static int __init taprio_module_init(void) 2006{ 2007 int err = register_netdevice_notifier(&taprio_device_notifier); 2008 2009 if (err) 2010 return err; 2011 2012 return register_qdisc(&taprio_qdisc_ops); 2013} 2014 2015static void __exit taprio_module_exit(void) 2016{ 2017 unregister_qdisc(&taprio_qdisc_ops); 2018 unregister_netdevice_notifier(&taprio_device_notifier); 2019} 2020 2021module_init(taprio_module_init); 2022module_exit(taprio_module_exit); 2023MODULE_LICENSE("GPL"); 2024