/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL also allows the page to be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
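/* Illustrative sketch (not part of this file): a driver typically
 * creates one pool per RX queue. The 'priv' pointer and the sizes
 * below are hypothetical:
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP,
 *		.order		= 0,
 *		.pool_size	= 256,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= priv->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *	};
 *	struct page_pool *pool;
 *
 *	pool = page_pool_create(&pp_params);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */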
static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Softirq guarantees the CPU, and thus the NUMA node, stays
	 * stable. This assumes the CPU refilling the driver RX-ring
	 * also runs the RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Slower-path: Get pages from locked ring queue */
	spin_lock(&r->consumer_lock);

	/* Refill alloc array, but only if NUMA node matches */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fallthrough to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0))
		page = pool->alloc.cache[--pool->alloc.count];

	spin_unlock(&r->consumer_lock);
	return page;
}
/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}
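/* Illustrative sketch of how offset/max_len bound the sync window,
 * e.g. an XDP RX setup where the device never writes the headroom
 * (parameter values hypothetical):
 *
 *	pp_params.offset  = XDP_PACKET_HEADROOM;
 *	pp_params.max_len = PAGE_SIZE - XDP_PACKET_HEADROOM;
 *
 * With dma_sync_size == pool->p.max_len, the call above then syncs
 * only [dma_addr + offset, dma_addr + offset + max_len), i.e. the
 * region the device may have DMA'ed into.
 */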
/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * Current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance. This code
	 * needs bulk allocation support from the page allocator code.
	 */

	/* Cache was empty, do real allocation */
#ifdef CONFIG_NUMA
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
#else
	page = alloc_pages(gfp, pool->p.order);
#endif
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e 32bit cpu with 64bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	page_pool_set_dma_addr(page, dma);

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

skip_dma_map:
	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;

	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);

	/* A page just allocated should/must have refcnt 1. */
	return page;
}

/* Use page_pool to replace alloc_pages() API calls; the pool also
 * provides a synchronization guarantee on the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b) (s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}
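/* Worked example of the wrap-safe distance: after hold_cnt wraps,
 * e.g. hold_cnt == 2 and release_cnt == 0xfffffffe,
 * _distance(2, 0xfffffffe) == (s32)(2 - 0xfffffffe) == 4, the true
 * number of in-flight pages, even though hold_cnt < release_cnt
 * numerically.
 */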
/* Disconnect a page from the page_pool. API users may need to
 * disconnect a page, to allow it to be used as a regular page (that
 * will eventually be returned to the normal page-allocator via
 * put_page).
 */
void page_pool_release_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}
EXPORT_SYMBOL(page_pool_release_page);
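/* Illustrative sketch (hypothetical variable names): a driver that
 * hands a page up the stack as an skb fragment, and thus out of the
 * pool's control, first disconnects it:
 *
 *	page_pool_release_page(pool, page);
 *	skb_add_rx_frag(skb, i, page, offset, len, truesize);
 *
 * Afterwards the page behaves as a regular page and is freed by the
 * normal put_page() path.
 */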
/* Return a page to the page allocator, cleaning up our state */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	page_pool_release_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return ret == 0;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

/* page is NOT reusable when:
 * 1) it was allocated when the system was under memory pressure
 *    (page_is_pfmemalloc)
 */
static bool pool_page_reusable(struct page_pool *pool, struct page *page)
{
	return !page_is_pfmemalloc(page);
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
void page_pool_put_page(struct page_pool *pool, struct page *page,
			unsigned int dma_sync_size, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1 &&
		   pool_page_reusable(pool, page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_serving_softirq())
			if (page_pool_recycle_in_cache(page, pool))
				return;

		if (!page_pool_recycle_in_ring(pool, page)) {
			/* Ring full, fallback to freeing the page */
			page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	/* Do not replace this with page_pool_return_page() */
	page_pool_release_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(page_pool_put_page);
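/* Illustrative sketch: recycling from an RX-NAPI XDP verdict switch,
 * where direct recycling into the alloc cache is safe (names
 * hypothetical):
 *
 *	case XDP_DROP:
 *		page_pool_put_page(pool, page, pkt_len, true);
 *		break;
 *
 * Passing pkt_len as dma_sync_size limits the sync-for-device to the
 * bytes actually written; callers outside softirq context must pass
 * allow_direct == false so the page goes through the ptr_ring.
 */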
static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (page_ref_count(page) != 1)
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no-longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}
static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
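/* Illustrative sketch of teardown ordering (hypothetical names):
 * quiesce all allocation/recycling paths first, then destroy; the
 * deferred work above copes with pages still in-flight:
 *
 *	napi_disable(&priv->napi);
 *	...return or release all pages held in the driver RX ring...
 *	page_pool_destroy(pool);
 */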
/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);
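/* Illustrative sketch (hypothetical names): a driver noticing from
 * its NAPI poll loop that the RX interrupt migrated could re-home
 * the pool:
 *
 *	page_pool_update_nid(pool, numa_mem_id());
 */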