162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2007 Oracle. All rights reserved. 462306a36Sopenharmony_ci * Copyright (C) 2014 Fujitsu. All rights reserved. 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci 762306a36Sopenharmony_ci#include <linux/kthread.h> 862306a36Sopenharmony_ci#include <linux/slab.h> 962306a36Sopenharmony_ci#include <linux/list.h> 1062306a36Sopenharmony_ci#include <linux/spinlock.h> 1162306a36Sopenharmony_ci#include <linux/freezer.h> 1262306a36Sopenharmony_ci#include "async-thread.h" 1362306a36Sopenharmony_ci#include "ctree.h" 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_cienum { 1662306a36Sopenharmony_ci WORK_DONE_BIT, 1762306a36Sopenharmony_ci WORK_ORDER_DONE_BIT, 1862306a36Sopenharmony_ci}; 1962306a36Sopenharmony_ci 2062306a36Sopenharmony_ci#define NO_THRESHOLD (-1) 2162306a36Sopenharmony_ci#define DFT_THRESHOLD (32) 2262306a36Sopenharmony_ci 2362306a36Sopenharmony_cistruct btrfs_workqueue { 2462306a36Sopenharmony_ci struct workqueue_struct *normal_wq; 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_ci /* File system this workqueue services */ 2762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info; 2862306a36Sopenharmony_ci 2962306a36Sopenharmony_ci /* List head pointing to ordered work list */ 3062306a36Sopenharmony_ci struct list_head ordered_list; 3162306a36Sopenharmony_ci 3262306a36Sopenharmony_ci /* Spinlock for ordered_list */ 3362306a36Sopenharmony_ci spinlock_t list_lock; 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci /* Thresholding related variants */ 3662306a36Sopenharmony_ci atomic_t pending; 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci /* Up limit of concurrency workers */ 3962306a36Sopenharmony_ci int limit_active; 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_ci /* Current number of concurrency workers */ 4262306a36Sopenharmony_ci int current_active; 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ci /* Threshold to change current_active */ 4562306a36Sopenharmony_ci int thresh; 4662306a36Sopenharmony_ci unsigned int count; 4762306a36Sopenharmony_ci spinlock_t thres_lock; 4862306a36Sopenharmony_ci}; 4962306a36Sopenharmony_ci 5062306a36Sopenharmony_cistruct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) 5162306a36Sopenharmony_ci{ 5262306a36Sopenharmony_ci return wq->fs_info; 5362306a36Sopenharmony_ci} 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_cistruct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) 5662306a36Sopenharmony_ci{ 5762306a36Sopenharmony_ci return work->wq->fs_info; 5862306a36Sopenharmony_ci} 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_cibool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) 6162306a36Sopenharmony_ci{ 6262306a36Sopenharmony_ci /* 6362306a36Sopenharmony_ci * We could compare wq->pending with num_online_cpus() 6462306a36Sopenharmony_ci * to support "thresh == NO_THRESHOLD" case, but it requires 6562306a36Sopenharmony_ci * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's 6662306a36Sopenharmony_ci * postpone it until someone needs the support of that case. 6762306a36Sopenharmony_ci */ 6862306a36Sopenharmony_ci if (wq->thresh == NO_THRESHOLD) 6962306a36Sopenharmony_ci return false; 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_ci return atomic_read(&wq->pending) > wq->thresh * 2; 7262306a36Sopenharmony_ci} 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_cistatic void btrfs_init_workqueue(struct btrfs_workqueue *wq, 7562306a36Sopenharmony_ci struct btrfs_fs_info *fs_info) 7662306a36Sopenharmony_ci{ 7762306a36Sopenharmony_ci wq->fs_info = fs_info; 7862306a36Sopenharmony_ci atomic_set(&wq->pending, 0); 7962306a36Sopenharmony_ci INIT_LIST_HEAD(&wq->ordered_list); 8062306a36Sopenharmony_ci spin_lock_init(&wq->list_lock); 8162306a36Sopenharmony_ci spin_lock_init(&wq->thres_lock); 8262306a36Sopenharmony_ci} 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_cistruct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, 8562306a36Sopenharmony_ci const char *name, unsigned int flags, 8662306a36Sopenharmony_ci int limit_active, int thresh) 8762306a36Sopenharmony_ci{ 8862306a36Sopenharmony_ci struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci if (!ret) 9162306a36Sopenharmony_ci return NULL; 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci btrfs_init_workqueue(ret, fs_info); 9462306a36Sopenharmony_ci 9562306a36Sopenharmony_ci ret->limit_active = limit_active; 9662306a36Sopenharmony_ci if (thresh == 0) 9762306a36Sopenharmony_ci thresh = DFT_THRESHOLD; 9862306a36Sopenharmony_ci /* For low threshold, disabling threshold is a better choice */ 9962306a36Sopenharmony_ci if (thresh < DFT_THRESHOLD) { 10062306a36Sopenharmony_ci ret->current_active = limit_active; 10162306a36Sopenharmony_ci ret->thresh = NO_THRESHOLD; 10262306a36Sopenharmony_ci } else { 10362306a36Sopenharmony_ci /* 10462306a36Sopenharmony_ci * For threshold-able wq, let its concurrency grow on demand. 10562306a36Sopenharmony_ci * Use minimal max_active at alloc time to reduce resource 10662306a36Sopenharmony_ci * usage. 10762306a36Sopenharmony_ci */ 10862306a36Sopenharmony_ci ret->current_active = 1; 10962306a36Sopenharmony_ci ret->thresh = thresh; 11062306a36Sopenharmony_ci } 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_ci ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, 11362306a36Sopenharmony_ci name); 11462306a36Sopenharmony_ci if (!ret->normal_wq) { 11562306a36Sopenharmony_ci kfree(ret); 11662306a36Sopenharmony_ci return NULL; 11762306a36Sopenharmony_ci } 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci trace_btrfs_workqueue_alloc(ret, name); 12062306a36Sopenharmony_ci return ret; 12162306a36Sopenharmony_ci} 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_cistruct btrfs_workqueue *btrfs_alloc_ordered_workqueue( 12462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info, const char *name, 12562306a36Sopenharmony_ci unsigned int flags) 12662306a36Sopenharmony_ci{ 12762306a36Sopenharmony_ci struct btrfs_workqueue *ret; 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_ci ret = kzalloc(sizeof(*ret), GFP_KERNEL); 13062306a36Sopenharmony_ci if (!ret) 13162306a36Sopenharmony_ci return NULL; 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci btrfs_init_workqueue(ret, fs_info); 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci /* Ordered workqueues don't allow @max_active adjustments. */ 13662306a36Sopenharmony_ci ret->limit_active = 1; 13762306a36Sopenharmony_ci ret->current_active = 1; 13862306a36Sopenharmony_ci ret->thresh = NO_THRESHOLD; 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_ci ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name); 14162306a36Sopenharmony_ci if (!ret->normal_wq) { 14262306a36Sopenharmony_ci kfree(ret); 14362306a36Sopenharmony_ci return NULL; 14462306a36Sopenharmony_ci } 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci trace_btrfs_workqueue_alloc(ret, name); 14762306a36Sopenharmony_ci return ret; 14862306a36Sopenharmony_ci} 14962306a36Sopenharmony_ci 15062306a36Sopenharmony_ci/* 15162306a36Sopenharmony_ci * Hook for threshold which will be called in btrfs_queue_work. 15262306a36Sopenharmony_ci * This hook WILL be called in IRQ handler context, 15362306a36Sopenharmony_ci * so workqueue_set_max_active MUST NOT be called in this hook 15462306a36Sopenharmony_ci */ 15562306a36Sopenharmony_cistatic inline void thresh_queue_hook(struct btrfs_workqueue *wq) 15662306a36Sopenharmony_ci{ 15762306a36Sopenharmony_ci if (wq->thresh == NO_THRESHOLD) 15862306a36Sopenharmony_ci return; 15962306a36Sopenharmony_ci atomic_inc(&wq->pending); 16062306a36Sopenharmony_ci} 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_ci/* 16362306a36Sopenharmony_ci * Hook for threshold which will be called before executing the work, 16462306a36Sopenharmony_ci * This hook is called in kthread content. 16562306a36Sopenharmony_ci * So workqueue_set_max_active is called here. 16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_cistatic inline void thresh_exec_hook(struct btrfs_workqueue *wq) 16862306a36Sopenharmony_ci{ 16962306a36Sopenharmony_ci int new_current_active; 17062306a36Sopenharmony_ci long pending; 17162306a36Sopenharmony_ci int need_change = 0; 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci if (wq->thresh == NO_THRESHOLD) 17462306a36Sopenharmony_ci return; 17562306a36Sopenharmony_ci 17662306a36Sopenharmony_ci atomic_dec(&wq->pending); 17762306a36Sopenharmony_ci spin_lock(&wq->thres_lock); 17862306a36Sopenharmony_ci /* 17962306a36Sopenharmony_ci * Use wq->count to limit the calling frequency of 18062306a36Sopenharmony_ci * workqueue_set_max_active. 18162306a36Sopenharmony_ci */ 18262306a36Sopenharmony_ci wq->count++; 18362306a36Sopenharmony_ci wq->count %= (wq->thresh / 4); 18462306a36Sopenharmony_ci if (!wq->count) 18562306a36Sopenharmony_ci goto out; 18662306a36Sopenharmony_ci new_current_active = wq->current_active; 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci /* 18962306a36Sopenharmony_ci * pending may be changed later, but it's OK since we really 19062306a36Sopenharmony_ci * don't need it so accurate to calculate new_max_active. 19162306a36Sopenharmony_ci */ 19262306a36Sopenharmony_ci pending = atomic_read(&wq->pending); 19362306a36Sopenharmony_ci if (pending > wq->thresh) 19462306a36Sopenharmony_ci new_current_active++; 19562306a36Sopenharmony_ci if (pending < wq->thresh / 2) 19662306a36Sopenharmony_ci new_current_active--; 19762306a36Sopenharmony_ci new_current_active = clamp_val(new_current_active, 1, wq->limit_active); 19862306a36Sopenharmony_ci if (new_current_active != wq->current_active) { 19962306a36Sopenharmony_ci need_change = 1; 20062306a36Sopenharmony_ci wq->current_active = new_current_active; 20162306a36Sopenharmony_ci } 20262306a36Sopenharmony_ciout: 20362306a36Sopenharmony_ci spin_unlock(&wq->thres_lock); 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_ci if (need_change) { 20662306a36Sopenharmony_ci workqueue_set_max_active(wq->normal_wq, wq->current_active); 20762306a36Sopenharmony_ci } 20862306a36Sopenharmony_ci} 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_cistatic void run_ordered_work(struct btrfs_workqueue *wq, 21162306a36Sopenharmony_ci struct btrfs_work *self) 21262306a36Sopenharmony_ci{ 21362306a36Sopenharmony_ci struct list_head *list = &wq->ordered_list; 21462306a36Sopenharmony_ci struct btrfs_work *work; 21562306a36Sopenharmony_ci spinlock_t *lock = &wq->list_lock; 21662306a36Sopenharmony_ci unsigned long flags; 21762306a36Sopenharmony_ci bool free_self = false; 21862306a36Sopenharmony_ci 21962306a36Sopenharmony_ci while (1) { 22062306a36Sopenharmony_ci spin_lock_irqsave(lock, flags); 22162306a36Sopenharmony_ci if (list_empty(list)) 22262306a36Sopenharmony_ci break; 22362306a36Sopenharmony_ci work = list_entry(list->next, struct btrfs_work, 22462306a36Sopenharmony_ci ordered_list); 22562306a36Sopenharmony_ci if (!test_bit(WORK_DONE_BIT, &work->flags)) 22662306a36Sopenharmony_ci break; 22762306a36Sopenharmony_ci /* 22862306a36Sopenharmony_ci * Orders all subsequent loads after reading WORK_DONE_BIT, 22962306a36Sopenharmony_ci * paired with the smp_mb__before_atomic in btrfs_work_helper 23062306a36Sopenharmony_ci * this guarantees that the ordered function will see all 23162306a36Sopenharmony_ci * updates from ordinary work function. 23262306a36Sopenharmony_ci */ 23362306a36Sopenharmony_ci smp_rmb(); 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci /* 23662306a36Sopenharmony_ci * we are going to call the ordered done function, but 23762306a36Sopenharmony_ci * we leave the work item on the list as a barrier so 23862306a36Sopenharmony_ci * that later work items that are done don't have their 23962306a36Sopenharmony_ci * functions called before this one returns 24062306a36Sopenharmony_ci */ 24162306a36Sopenharmony_ci if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 24262306a36Sopenharmony_ci break; 24362306a36Sopenharmony_ci trace_btrfs_ordered_sched(work); 24462306a36Sopenharmony_ci spin_unlock_irqrestore(lock, flags); 24562306a36Sopenharmony_ci work->ordered_func(work); 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_ci /* now take the lock again and drop our item from the list */ 24862306a36Sopenharmony_ci spin_lock_irqsave(lock, flags); 24962306a36Sopenharmony_ci list_del(&work->ordered_list); 25062306a36Sopenharmony_ci spin_unlock_irqrestore(lock, flags); 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci if (work == self) { 25362306a36Sopenharmony_ci /* 25462306a36Sopenharmony_ci * This is the work item that the worker is currently 25562306a36Sopenharmony_ci * executing. 25662306a36Sopenharmony_ci * 25762306a36Sopenharmony_ci * The kernel workqueue code guarantees non-reentrancy 25862306a36Sopenharmony_ci * of work items. I.e., if a work item with the same 25962306a36Sopenharmony_ci * address and work function is queued twice, the second 26062306a36Sopenharmony_ci * execution is blocked until the first one finishes. A 26162306a36Sopenharmony_ci * work item may be freed and recycled with the same 26262306a36Sopenharmony_ci * work function; the workqueue code assumes that the 26362306a36Sopenharmony_ci * original work item cannot depend on the recycled work 26462306a36Sopenharmony_ci * item in that case (see find_worker_executing_work()). 26562306a36Sopenharmony_ci * 26662306a36Sopenharmony_ci * Note that different types of Btrfs work can depend on 26762306a36Sopenharmony_ci * each other, and one type of work on one Btrfs 26862306a36Sopenharmony_ci * filesystem may even depend on the same type of work 26962306a36Sopenharmony_ci * on another Btrfs filesystem via, e.g., a loop device. 27062306a36Sopenharmony_ci * Therefore, we must not allow the current work item to 27162306a36Sopenharmony_ci * be recycled until we are really done, otherwise we 27262306a36Sopenharmony_ci * break the above assumption and can deadlock. 27362306a36Sopenharmony_ci */ 27462306a36Sopenharmony_ci free_self = true; 27562306a36Sopenharmony_ci } else { 27662306a36Sopenharmony_ci /* 27762306a36Sopenharmony_ci * We don't want to call the ordered free functions with 27862306a36Sopenharmony_ci * the lock held. 27962306a36Sopenharmony_ci */ 28062306a36Sopenharmony_ci work->ordered_free(work); 28162306a36Sopenharmony_ci /* NB: work must not be dereferenced past this point. */ 28262306a36Sopenharmony_ci trace_btrfs_all_work_done(wq->fs_info, work); 28362306a36Sopenharmony_ci } 28462306a36Sopenharmony_ci } 28562306a36Sopenharmony_ci spin_unlock_irqrestore(lock, flags); 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci if (free_self) { 28862306a36Sopenharmony_ci self->ordered_free(self); 28962306a36Sopenharmony_ci /* NB: self must not be dereferenced past this point. */ 29062306a36Sopenharmony_ci trace_btrfs_all_work_done(wq->fs_info, self); 29162306a36Sopenharmony_ci } 29262306a36Sopenharmony_ci} 29362306a36Sopenharmony_ci 29462306a36Sopenharmony_cistatic void btrfs_work_helper(struct work_struct *normal_work) 29562306a36Sopenharmony_ci{ 29662306a36Sopenharmony_ci struct btrfs_work *work = container_of(normal_work, struct btrfs_work, 29762306a36Sopenharmony_ci normal_work); 29862306a36Sopenharmony_ci struct btrfs_workqueue *wq = work->wq; 29962306a36Sopenharmony_ci int need_order = 0; 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci /* 30262306a36Sopenharmony_ci * We should not touch things inside work in the following cases: 30362306a36Sopenharmony_ci * 1) after work->func() if it has no ordered_free 30462306a36Sopenharmony_ci * Since the struct is freed in work->func(). 30562306a36Sopenharmony_ci * 2) after setting WORK_DONE_BIT 30662306a36Sopenharmony_ci * The work may be freed in other threads almost instantly. 30762306a36Sopenharmony_ci * So we save the needed things here. 30862306a36Sopenharmony_ci */ 30962306a36Sopenharmony_ci if (work->ordered_func) 31062306a36Sopenharmony_ci need_order = 1; 31162306a36Sopenharmony_ci 31262306a36Sopenharmony_ci trace_btrfs_work_sched(work); 31362306a36Sopenharmony_ci thresh_exec_hook(wq); 31462306a36Sopenharmony_ci work->func(work); 31562306a36Sopenharmony_ci if (need_order) { 31662306a36Sopenharmony_ci /* 31762306a36Sopenharmony_ci * Ensures all memory accesses done in the work function are 31862306a36Sopenharmony_ci * ordered before setting the WORK_DONE_BIT. Ensuring the thread 31962306a36Sopenharmony_ci * which is going to executed the ordered work sees them. 32062306a36Sopenharmony_ci * Pairs with the smp_rmb in run_ordered_work. 32162306a36Sopenharmony_ci */ 32262306a36Sopenharmony_ci smp_mb__before_atomic(); 32362306a36Sopenharmony_ci set_bit(WORK_DONE_BIT, &work->flags); 32462306a36Sopenharmony_ci run_ordered_work(wq, work); 32562306a36Sopenharmony_ci } else { 32662306a36Sopenharmony_ci /* NB: work must not be dereferenced past this point. */ 32762306a36Sopenharmony_ci trace_btrfs_all_work_done(wq->fs_info, work); 32862306a36Sopenharmony_ci } 32962306a36Sopenharmony_ci} 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_civoid btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, 33262306a36Sopenharmony_ci btrfs_func_t ordered_func, btrfs_func_t ordered_free) 33362306a36Sopenharmony_ci{ 33462306a36Sopenharmony_ci work->func = func; 33562306a36Sopenharmony_ci work->ordered_func = ordered_func; 33662306a36Sopenharmony_ci work->ordered_free = ordered_free; 33762306a36Sopenharmony_ci INIT_WORK(&work->normal_work, btrfs_work_helper); 33862306a36Sopenharmony_ci INIT_LIST_HEAD(&work->ordered_list); 33962306a36Sopenharmony_ci work->flags = 0; 34062306a36Sopenharmony_ci} 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_civoid btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) 34362306a36Sopenharmony_ci{ 34462306a36Sopenharmony_ci unsigned long flags; 34562306a36Sopenharmony_ci 34662306a36Sopenharmony_ci work->wq = wq; 34762306a36Sopenharmony_ci thresh_queue_hook(wq); 34862306a36Sopenharmony_ci if (work->ordered_func) { 34962306a36Sopenharmony_ci spin_lock_irqsave(&wq->list_lock, flags); 35062306a36Sopenharmony_ci list_add_tail(&work->ordered_list, &wq->ordered_list); 35162306a36Sopenharmony_ci spin_unlock_irqrestore(&wq->list_lock, flags); 35262306a36Sopenharmony_ci } 35362306a36Sopenharmony_ci trace_btrfs_work_queued(work); 35462306a36Sopenharmony_ci queue_work(wq->normal_wq, &work->normal_work); 35562306a36Sopenharmony_ci} 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_civoid btrfs_destroy_workqueue(struct btrfs_workqueue *wq) 35862306a36Sopenharmony_ci{ 35962306a36Sopenharmony_ci if (!wq) 36062306a36Sopenharmony_ci return; 36162306a36Sopenharmony_ci destroy_workqueue(wq->normal_wq); 36262306a36Sopenharmony_ci trace_btrfs_workqueue_destroy(wq); 36362306a36Sopenharmony_ci kfree(wq); 36462306a36Sopenharmony_ci} 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_civoid btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) 36762306a36Sopenharmony_ci{ 36862306a36Sopenharmony_ci if (wq) 36962306a36Sopenharmony_ci wq->limit_active = limit_active; 37062306a36Sopenharmony_ci} 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_civoid btrfs_flush_workqueue(struct btrfs_workqueue *wq) 37362306a36Sopenharmony_ci{ 37462306a36Sopenharmony_ci flush_workqueue(wq->normal_wq); 37562306a36Sopenharmony_ci} 376