18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright (C) 2008 Oracle. All rights reserved. 48c2ecf20Sopenharmony_ci */ 58c2ecf20Sopenharmony_ci 68c2ecf20Sopenharmony_ci#include <linux/sched.h> 78c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 88c2ecf20Sopenharmony_ci#include <linux/spinlock.h> 98c2ecf20Sopenharmony_ci#include <linux/page-flags.h> 108c2ecf20Sopenharmony_ci#include <asm/bug.h> 118c2ecf20Sopenharmony_ci#include "misc.h" 128c2ecf20Sopenharmony_ci#include "ctree.h" 138c2ecf20Sopenharmony_ci#include "extent_io.h" 148c2ecf20Sopenharmony_ci#include "locking.h" 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_ci/* 178c2ecf20Sopenharmony_ci * Extent buffer locking 188c2ecf20Sopenharmony_ci * ===================== 198c2ecf20Sopenharmony_ci * 208c2ecf20Sopenharmony_ci * The locks use a custom scheme that allows to do more operations than are 218c2ecf20Sopenharmony_ci * available fromt current locking primitives. The building blocks are still 228c2ecf20Sopenharmony_ci * rwlock and wait queues. 238c2ecf20Sopenharmony_ci * 248c2ecf20Sopenharmony_ci * Required semantics: 258c2ecf20Sopenharmony_ci * 268c2ecf20Sopenharmony_ci * - reader/writer exclusion 278c2ecf20Sopenharmony_ci * - writer/writer exclusion 288c2ecf20Sopenharmony_ci * - reader/reader sharing 298c2ecf20Sopenharmony_ci * - spinning lock semantics 308c2ecf20Sopenharmony_ci * - blocking lock semantics 318c2ecf20Sopenharmony_ci * - try-lock semantics for readers and writers 328c2ecf20Sopenharmony_ci * - one level nesting, allowing read lock to be taken by the same thread that 338c2ecf20Sopenharmony_ci * already has write lock 348c2ecf20Sopenharmony_ci * 358c2ecf20Sopenharmony_ci * The extent buffer locks (also called tree locks) manage access to eb data 368c2ecf20Sopenharmony_ci * related to the storage in the b-tree (keys, items, but not the individual 378c2ecf20Sopenharmony_ci * members of eb). 
388c2ecf20Sopenharmony_ci * We want concurrency of many readers and safe updates. The underlying locking 398c2ecf20Sopenharmony_ci * is done by read-write spinlock and the blocking part is implemented using 408c2ecf20Sopenharmony_ci * counters and wait queues. 418c2ecf20Sopenharmony_ci * 428c2ecf20Sopenharmony_ci * spinning semantics - the low-level rwlock is held so all other threads that 438c2ecf20Sopenharmony_ci * want to take it are spinning on it. 448c2ecf20Sopenharmony_ci * 458c2ecf20Sopenharmony_ci * blocking semantics - the low-level rwlock is not held but the counter 468c2ecf20Sopenharmony_ci * denotes how many times the blocking lock was held; 478c2ecf20Sopenharmony_ci * sleeping is possible 488c2ecf20Sopenharmony_ci * 498c2ecf20Sopenharmony_ci * Write lock always allows only one thread to access the data. 508c2ecf20Sopenharmony_ci * 518c2ecf20Sopenharmony_ci * 528c2ecf20Sopenharmony_ci * Debugging 538c2ecf20Sopenharmony_ci * --------- 548c2ecf20Sopenharmony_ci * 558c2ecf20Sopenharmony_ci * There are additional state counters that are asserted in various contexts, 568c2ecf20Sopenharmony_ci * removed from non-debug build to reduce extent_buffer size and for 578c2ecf20Sopenharmony_ci * performance reasons. 588c2ecf20Sopenharmony_ci * 598c2ecf20Sopenharmony_ci * 608c2ecf20Sopenharmony_ci * Lock recursion 618c2ecf20Sopenharmony_ci * -------------- 628c2ecf20Sopenharmony_ci * 638c2ecf20Sopenharmony_ci * A write operation on a tree might indirectly start a look up on the same 648c2ecf20Sopenharmony_ci * tree. This can happen when btrfs_cow_block locks the tree and needs to 658c2ecf20Sopenharmony_ci * lookup free extents. 668c2ecf20Sopenharmony_ci * 678c2ecf20Sopenharmony_ci * btrfs_cow_block 688c2ecf20Sopenharmony_ci * .. 698c2ecf20Sopenharmony_ci * alloc_tree_block_no_bg_flush 708c2ecf20Sopenharmony_ci * btrfs_alloc_tree_block 718c2ecf20Sopenharmony_ci * btrfs_reserve_extent 728c2ecf20Sopenharmony_ci * .. 
 *   load_free_space_cache
 *     ..
 *       btrfs_lookup_file_extent
 *         btrfs_search_slot
 *
 *
 * Locking pattern - spinning
 * --------------------------
 *
 * The simple locking scenario, the +--+ denotes the spinning section.
 *
 * +- btrfs_tree_lock
 * | - extent_buffer::rwlock is held
 * | - no heavy operations should happen, eg. IO, memory allocations, large
 * |   structure traversals
 * +- btrfs_tree_unlock
 *
 *
 * Locking pattern - blocking
 * --------------------------
 *
 * The blocking write uses the following scheme. The +--+ denotes the spinning
 * section.
 *
 * +- btrfs_tree_lock
 * |
 * +- btrfs_set_lock_blocking_write
 *
 *   - allowed: IO, memory allocations, etc.
 *
 * -- btrfs_tree_unlock - note, no explicit unblocking necessary
 *
 *
 * Blocking read is similar.
 *
 * +- btrfs_tree_read_lock
 * |
 * +- btrfs_set_lock_blocking_read
 *
 *  - heavy operations allowed
 *
 * +- btrfs_tree_read_unlock_blocking
 * |
 * +- btrfs_tree_read_unlock
 *
 */

#ifdef CONFIG_BTRFS_DEBUG
/* A spinning writer is taking the lock; there must be none already. */
static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
{
	WARN_ON(eb->spinning_writers);
	eb->spinning_writers++;
}

/* A spinning writer is releasing the lock; exactly one must be recorded. */
static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
{
	WARN_ON(eb->spinning_writers != 1);
	eb->spinning_writers--;
}

/* Assert that no spinning writer is recorded on the eb. */
static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
{
	WARN_ON(eb->spinning_writers);
}

/* A spinning reader is taking the lock (readers can share). */
static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
{
	atomic_inc(&eb->spinning_readers);
}

/* A spinning reader is releasing the lock; at least one must be recorded. */
static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
	atomic_dec(&eb->spinning_readers);
}

/* Track a new read lock holder (spinning or blocking). */
static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
{
	atomic_inc(&eb->read_locks);
}

/* Drop one read lock holder. */
static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
{
	atomic_dec(&eb->read_locks);
}

/* Assert that at least one read lock is held on the eb. */
static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
	BUG_ON(!atomic_read(&eb->read_locks));
}

/* Track a new write lock holder. */
static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
{
	eb->write_locks++;
}

/* Drop one write lock holder. */
static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
{
	eb->write_locks--;
}

#else
/* Non-debug builds: all assertion counters compile to no-ops. */
static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
static void
btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
#endif

/*
 * Mark already held read lock as blocking. Can be nested in write lock by the
 * same thread.
 *
 * Use when there are potentially long operations ahead so other thread waiting
 * on the lock will not actively spin but sleep instead.
 *
 * The rwlock is released and blocking reader counter is increased.
 */
void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
	trace_btrfs_set_lock_blocking_read(eb);
	/*
	 * No lock is required. The lock owner may change if we have a read
	 * lock, but it won't change to or away from us. If we have the write
	 * lock, we are the owner and it'll never change.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner)
		return;
	btrfs_assert_tree_read_locked(eb);
	/* Bump blocking count before dropping the rwlock so writers see us */
	atomic_inc(&eb->blocking_readers);
	btrfs_assert_spinning_readers_put(eb);
	read_unlock(&eb->lock);
}

/*
 * Mark already held write lock as blocking.
 *
 * Use when there are potentially long operations ahead so other threads
 * waiting on the lock will not actively spin but sleep instead.
 *
 * The rwlock is released and blocking writers is set.
 */
void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
{
	trace_btrfs_set_lock_blocking_write(eb);
	/*
	 * No lock is required. The lock owner may change if we have a read
	 * lock, but it won't change to or away from us. If we have the write
	 * lock, we are the owner and it'll never change.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner)
		return;
	if (eb->blocking_writers == 0) {
		btrfs_assert_spinning_writers_put(eb);
		btrfs_assert_tree_locked(eb);
		/* Set before unlock: pairs with READ_ONCE in the lockers */
		WRITE_ONCE(eb->blocking_writers, 1);
		write_unlock(&eb->lock);
	}
}

/*
 * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
 * Can be nested in write lock by the same thread.
 *
 * Use when the locked section does only lightweight actions and busy waiting
 * would be cheaper than making other threads do the wait/wake loop.
 *
 * The rwlock is held upon exit.
 */
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
			    bool recurse)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_read_lock_enabled())
		start_ns = ktime_get_ns();
again:
	read_lock(&eb->lock);
	/* A thread holding the spinning write lock must not read-lock here */
	BUG_ON(eb->blocking_writers == 0 &&
	       current->pid == eb->lock_owner);
	if (eb->blocking_writers) {
		if (current->pid == eb->lock_owner) {
			/*
			 * This extent is already write-locked by our thread.
			 * We allow an additional read lock to be added because
			 * it's for the same thread. btrfs_find_all_roots()
			 * depends on this as it may be called on a partly
			 * (write-)locked tree.
			 */
			WARN_ON(!recurse);
			BUG_ON(eb->lock_recursed);
			eb->lock_recursed = true;
			read_unlock(&eb->lock);
			trace_btrfs_tree_read_lock(eb, start_ns);
			return;
		}
		/* A blocking writer exists; sleep until it's gone and retry */
		read_unlock(&eb->lock);
		wait_event(eb->write_lock_wq,
			   READ_ONCE(eb->blocking_writers) == 0);
		goto again;
	}
	btrfs_assert_tree_read_locks_get(eb);
	btrfs_assert_spinning_readers_get(eb);
	trace_btrfs_tree_read_lock(eb, start_ns);
}

/* Read lock with normal nesting and no recursion allowed. */
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
	__btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
}

/*
 * Lock extent buffer for read, optimistically expecting that there are no
 * contending blocking writers. If there are, don't wait.
 *
 * Return 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
{
	/* Unlocked peek first to avoid touching the rwlock at all */
	if (READ_ONCE(eb->blocking_writers))
		return 0;

	read_lock(&eb->lock);
	/* Refetch value after lock */
	if (READ_ONCE(eb->blocking_writers)) {
		read_unlock(&eb->lock);
		return 0;
	}
	btrfs_assert_tree_read_locks_get(eb);
	btrfs_assert_spinning_readers_get(eb);
	trace_btrfs_tree_read_lock_atomic(eb);
	return 1;
}

/*
 * Try-lock for read. Don't block or wait for contending writers.
 *
 * Return 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
	/* Unlocked peek first to avoid touching the rwlock at all */
	if (READ_ONCE(eb->blocking_writers))
		return 0;

	if (!read_trylock(&eb->lock))
		return 0;

	/* Refetch value after lock */
	if (READ_ONCE(eb->blocking_writers)) {
		read_unlock(&eb->lock);
		return 0;
	}
	btrfs_assert_tree_read_locks_get(eb);
	btrfs_assert_spinning_readers_get(eb);
	trace_btrfs_try_tree_read_lock(eb);
	return 1;
}

/*
 * Try-lock for write. May block until the lock is uncontended, but does not
 * wait until it is free.
 *
 * Return 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
	/* Unlocked peek first to avoid touching the rwlock at all */
	if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
		return 0;

	write_lock(&eb->lock);
	/* Refetch value after lock */
	if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
		write_unlock(&eb->lock);
		return 0;
	}
	btrfs_assert_tree_write_locks_get(eb);
	btrfs_assert_spinning_writers_get(eb);
	eb->lock_owner = current->pid;
	trace_btrfs_try_tree_write_lock(eb);
	return 1;
}

/*
 * Release read lock. Must be used only if the lock is in spinning mode. If
 * the read lock is nested, must pair with read lock before the write unlock.
 *
 * The rwlock is not held upon exit.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock(eb);
	/*
	 * if we're nested, we have the write lock. No new locking
	 * is needed as long as we are the lock owner.
	 * The write unlock will do a barrier for us, and the lock_recursed
	 * field only matters to the lock owner.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner) {
		eb->lock_recursed = false;
		return;
	}
	btrfs_assert_tree_read_locked(eb);
	btrfs_assert_spinning_readers_put(eb);
	btrfs_assert_tree_read_locks_put(eb);
	read_unlock(&eb->lock);
}

/*
 * Release read lock, previously set to blocking by a pairing call to
 * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
 * thread.
 *
 * State of rwlock is unchanged, last reader wakes waiting threads.
 */
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock_blocking(eb);
	/*
	 * if we're nested, we have the write lock. No new locking
	 * is needed as long as we are the lock owner.
	 * The write unlock will do a barrier for us, and the lock_recursed
	 * field only matters to the lock owner.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner) {
		eb->lock_recursed = false;
		return;
	}
	btrfs_assert_tree_read_locked(eb);
	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
	/* atomic_dec_and_test implies a barrier */
	if (atomic_dec_and_test(&eb->blocking_readers))
		cond_wake_up_nomb(&eb->read_lock_wq);
	btrfs_assert_tree_read_locks_put(eb);
}

/*
 * Lock for write. Wait for all blocking and spinning readers and writers. This
 * starts context where reader lock could be nested by the same thread.
 *
 * The rwlock is held for write upon exit.
 */
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
	__acquires(&eb->lock)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_lock_enabled())
		start_ns = ktime_get_ns();

	/* Write lock is not allowed to be taken recursively */
	WARN_ON(eb->lock_owner == current->pid);
again:
	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
	wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
	write_lock(&eb->lock);
	/* Refetch value after lock */
	if (atomic_read(&eb->blocking_readers) ||
	    READ_ONCE(eb->blocking_writers)) {
		write_unlock(&eb->lock);
		goto again;
	}
	btrfs_assert_spinning_writers_get(eb);
	btrfs_assert_tree_write_locks_get(eb);
	eb->lock_owner = current->pid;
	trace_btrfs_tree_lock(eb, start_ns);
}

/* Write lock with normal nesting. */
void btrfs_tree_lock(struct extent_buffer *eb)
{
	__btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Release the write lock, either blocking or spinning (ie. there's no need
 * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
 * This also ends the context for nesting, the read lock must have been
 * released already.
 *
 * Tasks blocked and waiting are woken, rwlock is not held upon exit.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
	/*
	 * This is read both locked and unlocked but always by the same thread
	 * that already owns the lock so we don't need to use READ_ONCE
	 */
	int blockers = eb->blocking_writers;

	BUG_ON(blockers > 1);

	btrfs_assert_tree_locked(eb);
	trace_btrfs_tree_unlock(eb);
	eb->lock_owner = 0;
	btrfs_assert_tree_write_locks_put(eb);

	if (blockers) {
		btrfs_assert_no_spinning_writers(eb);
		/* Unlocked write */
		WRITE_ONCE(eb->blocking_writers, 0);
		/*
		 * We need to order modifying blocking_writers above with
		 * actually waking up the sleepers to ensure they see the
		 * updated value of blocking_writers
		 */
		cond_wake_up(&eb->write_lock_wq);
	} else {
		btrfs_assert_spinning_writers_put(eb);
		write_unlock(&eb->lock);
	}
}

/*
 * Set all locked nodes in the path to blocking locks.
This should be done 4908c2ecf20Sopenharmony_ci * before scheduling 4918c2ecf20Sopenharmony_ci */ 4928c2ecf20Sopenharmony_civoid btrfs_set_path_blocking(struct btrfs_path *p) 4938c2ecf20Sopenharmony_ci{ 4948c2ecf20Sopenharmony_ci int i; 4958c2ecf20Sopenharmony_ci 4968c2ecf20Sopenharmony_ci for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4978c2ecf20Sopenharmony_ci if (!p->nodes[i] || !p->locks[i]) 4988c2ecf20Sopenharmony_ci continue; 4998c2ecf20Sopenharmony_ci /* 5008c2ecf20Sopenharmony_ci * If we currently have a spinning reader or writer lock this 5018c2ecf20Sopenharmony_ci * will bump the count of blocking holders and drop the 5028c2ecf20Sopenharmony_ci * spinlock. 5038c2ecf20Sopenharmony_ci */ 5048c2ecf20Sopenharmony_ci if (p->locks[i] == BTRFS_READ_LOCK) { 5058c2ecf20Sopenharmony_ci btrfs_set_lock_blocking_read(p->nodes[i]); 5068c2ecf20Sopenharmony_ci p->locks[i] = BTRFS_READ_LOCK_BLOCKING; 5078c2ecf20Sopenharmony_ci } else if (p->locks[i] == BTRFS_WRITE_LOCK) { 5088c2ecf20Sopenharmony_ci btrfs_set_lock_blocking_write(p->nodes[i]); 5098c2ecf20Sopenharmony_ci p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; 5108c2ecf20Sopenharmony_ci } 5118c2ecf20Sopenharmony_ci } 5128c2ecf20Sopenharmony_ci} 5138c2ecf20Sopenharmony_ci 5148c2ecf20Sopenharmony_ci/* 5158c2ecf20Sopenharmony_ci * This releases any locks held in the path starting at level and going all the 5168c2ecf20Sopenharmony_ci * way up to the root. 5178c2ecf20Sopenharmony_ci * 5188c2ecf20Sopenharmony_ci * btrfs_search_slot will keep the lock held on higher nodes in a few corner 5198c2ecf20Sopenharmony_ci * cases, such as COW of the block at slot zero in the node. This ignores 5208c2ecf20Sopenharmony_ci * those rules, and it should only be called when there are no more updates to 5218c2ecf20Sopenharmony_ci * be done higher up in the tree. 
5228c2ecf20Sopenharmony_ci */ 5238c2ecf20Sopenharmony_civoid btrfs_unlock_up_safe(struct btrfs_path *path, int level) 5248c2ecf20Sopenharmony_ci{ 5258c2ecf20Sopenharmony_ci int i; 5268c2ecf20Sopenharmony_ci 5278c2ecf20Sopenharmony_ci if (path->keep_locks) 5288c2ecf20Sopenharmony_ci return; 5298c2ecf20Sopenharmony_ci 5308c2ecf20Sopenharmony_ci for (i = level; i < BTRFS_MAX_LEVEL; i++) { 5318c2ecf20Sopenharmony_ci if (!path->nodes[i]) 5328c2ecf20Sopenharmony_ci continue; 5338c2ecf20Sopenharmony_ci if (!path->locks[i]) 5348c2ecf20Sopenharmony_ci continue; 5358c2ecf20Sopenharmony_ci btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 5368c2ecf20Sopenharmony_ci path->locks[i] = 0; 5378c2ecf20Sopenharmony_ci } 5388c2ecf20Sopenharmony_ci} 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci/* 5418c2ecf20Sopenharmony_ci * Loop around taking references on and locking the root node of the tree until 5428c2ecf20Sopenharmony_ci * we end up with a lock on the root node. 5438c2ecf20Sopenharmony_ci * 5448c2ecf20Sopenharmony_ci * Return: root extent buffer with write lock held 5458c2ecf20Sopenharmony_ci */ 5468c2ecf20Sopenharmony_cistruct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) 5478c2ecf20Sopenharmony_ci{ 5488c2ecf20Sopenharmony_ci struct extent_buffer *eb; 5498c2ecf20Sopenharmony_ci 5508c2ecf20Sopenharmony_ci while (1) { 5518c2ecf20Sopenharmony_ci eb = btrfs_root_node(root); 5528c2ecf20Sopenharmony_ci btrfs_tree_lock(eb); 5538c2ecf20Sopenharmony_ci if (eb == root->node) 5548c2ecf20Sopenharmony_ci break; 5558c2ecf20Sopenharmony_ci btrfs_tree_unlock(eb); 5568c2ecf20Sopenharmony_ci free_extent_buffer(eb); 5578c2ecf20Sopenharmony_ci } 5588c2ecf20Sopenharmony_ci return eb; 5598c2ecf20Sopenharmony_ci} 5608c2ecf20Sopenharmony_ci 5618c2ecf20Sopenharmony_ci/* 5628c2ecf20Sopenharmony_ci * Loop around taking references on and locking the root node of the tree until 5638c2ecf20Sopenharmony_ci * we end up with a lock on the root node. 
5648c2ecf20Sopenharmony_ci * 5658c2ecf20Sopenharmony_ci * Return: root extent buffer with read lock held 5668c2ecf20Sopenharmony_ci */ 5678c2ecf20Sopenharmony_cistruct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, 5688c2ecf20Sopenharmony_ci bool recurse) 5698c2ecf20Sopenharmony_ci{ 5708c2ecf20Sopenharmony_ci struct extent_buffer *eb; 5718c2ecf20Sopenharmony_ci 5728c2ecf20Sopenharmony_ci while (1) { 5738c2ecf20Sopenharmony_ci eb = btrfs_root_node(root); 5748c2ecf20Sopenharmony_ci __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); 5758c2ecf20Sopenharmony_ci if (eb == root->node) 5768c2ecf20Sopenharmony_ci break; 5778c2ecf20Sopenharmony_ci btrfs_tree_read_unlock(eb); 5788c2ecf20Sopenharmony_ci free_extent_buffer(eb); 5798c2ecf20Sopenharmony_ci } 5808c2ecf20Sopenharmony_ci return eb; 5818c2ecf20Sopenharmony_ci} 5828c2ecf20Sopenharmony_ci 5838c2ecf20Sopenharmony_ci/* 5848c2ecf20Sopenharmony_ci * DREW locks 5858c2ecf20Sopenharmony_ci * ========== 5868c2ecf20Sopenharmony_ci * 5878c2ecf20Sopenharmony_ci * DREW stands for double-reader-writer-exclusion lock. It's used in situation 5888c2ecf20Sopenharmony_ci * where you want to provide A-B exclusion but not AA or BB. 5898c2ecf20Sopenharmony_ci * 5908c2ecf20Sopenharmony_ci * Currently implementation gives more priority to reader. If a reader and a 5918c2ecf20Sopenharmony_ci * writer both race to acquire their respective sides of the lock the writer 5928c2ecf20Sopenharmony_ci * would yield its lock as soon as it detects a concurrent reader. Additionally 5938c2ecf20Sopenharmony_ci * if there are pending readers no new writers would be allowed to come in and 5948c2ecf20Sopenharmony_ci * acquire the lock. 
 */

/* Initialize a DREW lock. Returns 0 or the percpu counter allocation error. */
int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
	int ret;

	ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
	if (ret)
		return ret;

	atomic_set(&lock->readers, 0);
	init_waitqueue_head(&lock->pending_readers);
	init_waitqueue_head(&lock->pending_writers);

	return 0;
}

/* Free resources allocated by btrfs_drew_lock_init(). */
void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
{
	percpu_counter_destroy(&lock->writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
	if (atomic_read(&lock->readers))
		return false;

	percpu_counter_inc(&lock->writers);

	/* Ensure writers count is updated before we check for pending readers */
	smp_mb();
	if (atomic_read(&lock->readers)) {
		/* A reader raced in; yield the write side to it. */
		btrfs_drew_write_unlock(lock);
		return false;
	}

	return true;
}

/* Acquire the write side, sleeping until there are no readers. */
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
	while (true) {
		if (btrfs_drew_try_write_lock(lock))
			return;
		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
	}
}

/* Release the write side and wake any readers waiting on it. */
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
	percpu_counter_dec(&lock->writers);
	cond_wake_up(&lock->pending_readers);
}

/* Acquire the read side, sleeping until all active writers are done. */
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
	atomic_inc(&lock->readers);

	/*
	 * Ensure the pending reader count is perceived BEFORE this reader
	 * goes to sleep in case of active writers. This guarantees new writers
	 * won't be allowed and that the current reader will be woken up when
	 * the last active writer finishes its jobs.
	 */
	smp_mb__after_atomic();

	wait_event(lock->pending_readers,
		   percpu_counter_sum(&lock->writers) == 0);
}

/* Release the read side; the last reader wakes pending writers. */
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
	/*
	 * atomic_dec_and_test implies a full barrier, so woken up writers
	 * are guaranteed to see the decrement
	 */
	if (atomic_dec_and_test(&lock->readers))
		wake_up(&lock->pending_writers);
}