/*
 * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/bitops.h>
#include <linux/export.h>

#include "rds.h"

/*
 * This file implements the receive side of the unconventional congestion
 * management in RDS.
 *
 * Messages waiting in the receive queue on the receiving socket are accounted
 * against the socket's SO_RCVBUF option value. Only the payload bytes in the
 * message are accounted for. If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested. All sends attempted to this socket's
 * address should block or return -EWOULDBLOCK.
 *
 * Applications are expected to be reasonably tuned such that this situation
 * very rarely occurs. An application encountering this "back-pressure" is
 * considered a bug.
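 *
 * Note that congestion is tracked per bound port on a local address rather
 * than per connection or per node: a slow receiver only stalls senders that
 * target its own port, while sends to other ports on the same address are
 * unaffected.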
 *
 * This is implemented by having each node maintain bitmaps which indicate
 * which ports on bound addresses are congested. As the bitmap changes it is
 * sent through all the connections which terminate in the local address of the
 * bitmap which changed.
 *
 * The bitmaps are allocated as connections are brought up. This avoids
 * allocation in the interrupt handling path which queues messages on sockets.
 * The dense bitmaps let transports send the entire bitmap on any bitmap change
 * reasonably efficiently. This is much easier to implement than some
 * finer-grained communication of per-port congestion. The sender does a very
 * inexpensive bit test to test if the port it's about to send to is congested
 * or not.
 */

/*
 * Interaction with poll is a tad tricky. We want all processes stuck in
 * poll to wake up and check whether a congested destination became uncongested.
 * The really sad thing is we have no idea which destinations the application
 * wants to send to - we don't even know which rds_connections are involved.
 * So until we implement a more flexible rds poll interface, we have to make
 * do with this:
 * We maintain a global counter that is incremented each time a congestion map
 * update is received. Each rds socket tracks this value, and if rds_poll
 * finds that the saved generation number is smaller than the global generation
 * number, it wakes up the process.
 */
static atomic_t rds_cong_generation = ATOMIC_INIT(0);

/*
 * Congestion monitoring
 */
static LIST_HEAD(rds_cong_monitor);
static DEFINE_RWLOCK(rds_cong_monitor_lock);

/*
 * Yes, a global lock. It's used so infrequently that it's worth keeping it
 * global to simplify the locking. It's only used in the following
 * circumstances:
 *
 *  - on connection buildup to associate a conn with its maps
 *  - on map changes to inform conns of a new map to send
 *
 * It's sadly ordered under the socket callback lock and the connection lock.
 * Receive paths can mark ports congested from interrupt context so the
 * lock masks interrupts.
 */
static DEFINE_SPINLOCK(rds_cong_lock);
static struct rb_root rds_cong_tree = RB_ROOT;
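
/*
 * Find the congestion map for @addr in the global rb-tree. If @insert is
 * non-NULL and no map exists for the address yet, link @insert into the
 * tree instead. Returns the existing map, or NULL if none was found (in
 * which case @insert, if given, is now in the tree). Callers must hold
 * rds_cong_lock.
 */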
static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
					       struct rds_cong_map *insert)
{
	struct rb_node **p = &rds_cong_tree.rb_node;
	struct rb_node *parent = NULL;
	struct rds_cong_map *map;

	while (*p) {
		int diff;

		parent = *p;
		map = rb_entry(parent, struct rds_cong_map, m_rb_node);

		diff = rds_addr_cmp(addr, &map->m_addr);
		if (diff < 0)
			p = &(*p)->rb_left;
		else if (diff > 0)
			p = &(*p)->rb_right;
		else
			return map;
	}

	if (insert) {
		rb_link_node(&insert->m_rb_node, parent, p);
		rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
	}
	return NULL;
}

/*
 * There is only ever one bitmap for any address. Connections try to allocate
 * these bitmaps in the process of getting pointers to them. The bitmaps are
 * only ever freed as the module is removed after all connections have been
 * freed.
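 *
 * If two connections race to allocate the map for the same address, the
 * tree walk under rds_cong_lock picks a single winner; the loser frees its
 * freshly allocated copy and uses the map that is already in the tree.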
 */
static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
{
	struct rds_cong_map *map;
	struct rds_cong_map *ret = NULL;
	unsigned long zp;
	unsigned long i;
	unsigned long flags;

	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
	if (!map)
		return NULL;

	map->m_addr = *addr;
	init_waitqueue_head(&map->m_waitq);
	INIT_LIST_HEAD(&map->m_conn_list);

	for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
		zp = get_zeroed_page(GFP_KERNEL);
		if (zp == 0)
			goto out;
		map->m_page_addrs[i] = zp;
	}

	spin_lock_irqsave(&rds_cong_lock, flags);
	ret = rds_cong_tree_walk(addr, map);
	spin_unlock_irqrestore(&rds_cong_lock, flags);

	if (!ret) {
		ret = map;
		map = NULL;
	}

out:
	if (map) {
		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
			free_page(map->m_page_addrs[i]);
		kfree(map);
	}

	rdsdebug("map %p for addr %pI6c\n", ret, addr);

	return ret;
}

/*
 * Put the conn on its local map's list. This is called when the conn is
 * really added to the hash. It's nested under the rds_conn_lock, sadly.
 */
void rds_cong_add_conn(struct rds_connection *conn)
{
	unsigned long flags;

	rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
	spin_lock_irqsave(&rds_cong_lock, flags);
	list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
	spin_unlock_irqrestore(&rds_cong_lock, flags);
}

void rds_cong_remove_conn(struct rds_connection *conn)
{
	unsigned long flags;

	rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
	spin_lock_irqsave(&rds_cong_lock, flags);
	list_del_init(&conn->c_map_item);
	spin_unlock_irqrestore(&rds_cong_lock, flags);
}
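
/*
 * Look up, allocating if necessary, the congestion maps for a connection's
 * local and foreign addresses. The maps are per-address and shared with
 * every other connection using those addresses.
 */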
int rds_cong_get_maps(struct rds_connection *conn)
{
	conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
	conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);

	if (!(conn->c_lcong && conn->c_fcong))
		return -ENOMEM;

	return 0;
}

void rds_cong_queue_updates(struct rds_cong_map *map)
{
	struct rds_connection *conn;
	unsigned long flags;

	spin_lock_irqsave(&rds_cong_lock, flags);

	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
		struct rds_conn_path *cp = &conn->c_path[0];

		rcu_read_lock();
		if (!test_and_set_bit(0, &conn->c_map_queued) &&
		    !rds_destroy_pending(cp->cp_conn)) {
			rds_stats_inc(s_cong_update_queued);
			/* We cannot inline the call to rds_send_xmit() here
			 * for two reasons (both pertaining to a TCP transport):
			 * 1. When we get here from the receive path, we
			 *    are already holding the sock_lock (held by
			 *    tcp_v4_rcv()). So inlining calls to
			 *    tcp_setsockopt and/or tcp_sendmsg will deadlock
			 *    when it tries to get the sock_lock().
			 * 2. Interrupts are masked so that we can mark the
			 *    port congested from both send and recv paths.
			 *    (See comment around declaration of rds_cong_lock).
			 *    An attempt to get the sock_lock() here will
			 *    therefore trigger warnings.
			 * Defer the xmit to rds_send_worker() instead.
			 */
			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
		}
		rcu_read_unlock();
	}

	spin_unlock_irqrestore(&rds_cong_lock, flags);
}
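
/*
 * Called by transports once an incoming congestion update for @map has been
 * applied. Bumps the global generation counter so pollers re-check their
 * destinations, wakes anyone sleeping in rds_cong_wait(), and flags
 * congestion-monitor sockets whose RDS_CONG_MONITOR_MASK() bits appear in
 * @portmask.
 */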
void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
{
	rdsdebug("waking map %p for %pI6c\n",
		 map, &map->m_addr);
	rds_stats_inc(s_cong_update_received);
	atomic_inc(&rds_cong_generation);
	if (waitqueue_active(&map->m_waitq))
		wake_up(&map->m_waitq);
	if (waitqueue_active(&rds_poll_waitq))
		wake_up_all(&rds_poll_waitq);

	if (portmask && !list_empty(&rds_cong_monitor)) {
		unsigned long flags;
		struct rds_sock *rs;

		read_lock_irqsave(&rds_cong_monitor_lock, flags);
		list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
			spin_lock(&rs->rs_lock);
			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
			rs->rs_cong_mask &= ~portmask;
			spin_unlock(&rs->rs_lock);
			if (rs->rs_cong_notify)
				rds_wake_sk_sleep(rs);
		}
		read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
	}
}
EXPORT_SYMBOL_GPL(rds_cong_map_updated);

int rds_cong_updated_since(unsigned long *recent)
{
	unsigned long gen = atomic_read(&rds_cong_generation);

	if (likely(*recent == gen))
		return 0;
	*recent = gen;
	return 1;
}

/*
 * We're called under the locking that protects the socket's receive buffer
 * consumption. This makes it a lot easier for the caller to only call us
 * when it knows that an existing set bit needs to be cleared, and vice versa.
 * We can't block and we need to deal with concurrent sockets working against
 * the same per-address map.
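 *
 * The map itself is a flat bitmap with one bit for each of the 65536
 * possible RDS ports (RDS_CONG_MAP_BYTES in total), stored in
 * RDS_CONG_MAP_PAGES individually allocated pages; a port number is split
 * into a page index and a bit offset within that page.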
 */
void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
{
	unsigned long i;
	unsigned long off;

	rdsdebug("setting congestion for %pI6c:%u in map %p\n",
		 &map->m_addr, ntohs(port), map);

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	set_bit_le(off, (void *)map->m_page_addrs[i]);
}

void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
{
	unsigned long i;
	unsigned long off;

	rdsdebug("clearing congestion for %pI6c:%u in map %p\n",
		 &map->m_addr, ntohs(port), map);

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	clear_bit_le(off, (void *)map->m_page_addrs[i]);
}

static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
{
	unsigned long i;
	unsigned long off;

	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	return test_bit_le(off, (void *)map->m_page_addrs[i]);
}

void rds_cong_add_socket(struct rds_sock *rs)
{
	unsigned long flags;

	write_lock_irqsave(&rds_cong_monitor_lock, flags);
	if (list_empty(&rs->rs_cong_list))
		list_add(&rs->rs_cong_list, &rds_cong_monitor);
	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}

void rds_cong_remove_socket(struct rds_sock *rs)
{
	unsigned long flags;
	struct rds_cong_map *map;

	write_lock_irqsave(&rds_cong_monitor_lock, flags);
	list_del_init(&rs->rs_cong_list);
	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);

	/* update congestion map for now-closed port */
	spin_lock_irqsave(&rds_cong_lock, flags);
	map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
	spin_unlock_irqrestore(&rds_cong_lock, flags);

	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
		rds_cong_clear_bit(map, rs->rs_bound_port);
		rds_cong_queue_updates(map);
	}
}
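
/*
 * Check whether @port at the address covered by @map is marked congested.
 * Returns 0 right away if it is not. Otherwise, a non-blocking send on a
 * congestion-monitor socket arms the socket's monitor mask and re-tests;
 * other non-blocking sends fail with -ENOBUFS, and blocking sends sleep on
 * the map's waitqueue until the bit is cleared.
 */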
int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
		  struct rds_sock *rs)
{
	if (!rds_cong_test_bit(map, port))
		return 0;
	if (nonblock) {
		if (rs && rs->rs_cong_monitor) {
			unsigned long flags;

			/* It would have been nice to have an atomic set_bit on
			 * a uint64_t. */
			spin_lock_irqsave(&rs->rs_lock, flags);
			rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
			spin_unlock_irqrestore(&rs->rs_lock, flags);

			/* Test again - a congestion update may have arrived in
			 * the meantime. */
			if (!rds_cong_test_bit(map, port))
				return 0;
		}
		rds_stats_inc(s_cong_send_error);
		return -ENOBUFS;
	}

	rds_stats_inc(s_cong_send_blocked);
	rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));

	return wait_event_interruptible(map->m_waitq,
					!rds_cong_test_bit(map, port));
}

void rds_cong_exit(void)
{
	struct rb_node *node;
	struct rds_cong_map *map;
	unsigned long i;

	while ((node = rb_first(&rds_cong_tree))) {
		map = rb_entry(node, struct rds_cong_map, m_rb_node);
		rdsdebug("freeing map %p\n", map);
		rb_erase(&map->m_rb_node, &rds_cong_tree);
		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
			free_page(map->m_page_addrs[i]);
		kfree(map);
	}
}

/*
 * Allocate an RDS message containing a congestion update.
 */
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
{
	struct rds_cong_map *map = conn->c_lcong;
	struct rds_message *rm;

	rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
	if (!IS_ERR(rm))
		rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;

	return rm;
}