// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include <linux/dm-bufio.h>

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>
#include <linux/jump_label.h>

#include "dm.h"

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_RATIO	3
#define DM_BUFIO_LOW_WATERMARK_RATIO	16

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The nr of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)

/*
 * Align buffer writes to this boundary.
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
#define DM_BUFIO_WRITE_ALIGN		4096

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*--------------------------------------------------------------*/

/*
 * Rather than use an LRU list, we use a clock algorithm where entries
 * are held in a circular list.  When an entry is 'hit' a reference bit
 * is set.  The least recently used entry is approximated by running a
 * cursor around the list selecting unreferenced entries. Referenced
 * entries have their reference bit cleared as the cursor passes them.
 */
struct lru_entry {
	struct list_head list;
	atomic_t referenced;
};

struct lru_iter {
	struct lru *lru;
	struct list_head list;
	struct lru_entry *stop;
	struct lru_entry *e;
};

struct lru {
	struct list_head *cursor;
	unsigned long count;

	struct list_head iterators;
};

/*--------------*/

static void lru_init(struct lru *lru)
{
	lru->cursor = NULL;
	lru->count = 0;
	INIT_LIST_HEAD(&lru->iterators);
}

static void lru_destroy(struct lru *lru)
{
	WARN_ON_ONCE(lru->cursor);
	WARN_ON_ONCE(!list_empty(&lru->iterators));
}

/*
 * Insert a new entry into the lru.
 */
static void lru_insert(struct lru *lru, struct lru_entry *le)
{
	/*
	 * Don't be tempted to set this to 1; it makes the lru
	 * aspect perform poorly.
	 */
	atomic_set(&le->referenced, 0);

	if (lru->cursor) {
		list_add_tail(&le->list, lru->cursor);
	} else {
		INIT_LIST_HEAD(&le->list);
		lru->cursor = &le->list;
	}
	lru->count++;
}
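
/*
 * Illustrative sketch (not part of the driver): how the circular list
 * evolves as entries are inserted.  Because list_add_tail() places the
 * new entry just before the cursor, the cursor keeps pointing at the
 * oldest unswept entry:
 *
 *	struct lru lru;
 *	struct lru_entry a, b, c;
 *
 *	lru_init(&lru);
 *	lru_insert(&lru, &a);	// cursor -> a
 *	lru_insert(&lru, &b);	// ring: a, b      cursor still -> a
 *	lru_insert(&lru, &c);	// ring: a, b, c   cursor still -> a
 */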

/*--------------*/

/*
 * Convert a list_head pointer to an lru_entry pointer.
 */
static inline struct lru_entry *to_le(struct list_head *l)
{
	return container_of(l, struct lru_entry, list);
}

/*
 * Initialize an lru_iter and add it to the list of cursors in the lru.
 */
static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
{
	it->lru = lru;
	it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
	it->e = lru->cursor ? to_le(lru->cursor) : NULL;
	list_add(&it->list, &lru->iterators);
}

/*
 * Remove an lru_iter from the list of cursors in the lru.
 */
static inline void lru_iter_end(struct lru_iter *it)
{
	list_del(&it->list);
}

/* Predicate function type to be used with lru_iter_next */
typedef bool (*iter_predicate)(struct lru_entry *le, void *context);

/*
 * Advance the cursor to the next entry that passes the
 * predicate, and return that entry.  Returns NULL if the
 * iteration is complete.
 */
static struct lru_entry *lru_iter_next(struct lru_iter *it,
				       iter_predicate pred, void *context)
{
	struct lru_entry *e;

	while (it->e) {
		e = it->e;

		/* advance the cursor */
		if (it->e == it->stop)
			it->e = NULL;
		else
			it->e = to_le(it->e->list.next);

		if (pred(e, context))
			return e;
	}

	return NULL;
}
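
/*
 * Minimal usage sketch (illustrative only, not called by the driver):
 * walk every entry present when the iteration started, tolerating
 * concurrent removals via lru_iter_invalidate() below.
 *
 *	static bool match_all(struct lru_entry *le, void *context)
 *	{
 *		return true;
 *	}
 *
 *	struct lru_iter it;
 *	struct lru_entry *le;
 *
 *	lru_iter_begin(&lru, &it);
 *	while ((le = lru_iter_next(&it, match_all, NULL)))
 *		process(le);		// 'process' is a hypothetical callback
 *	lru_iter_end(&it);
 */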

/*
 * Invalidate a specific lru_entry and update all cursors in
 * the lru accordingly.
 */
static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
{
	struct lru_iter *it;

	list_for_each_entry(it, &lru->iterators, list) {
		/* Move it->e forwards if necessary. */
		if (it->e == e) {
			it->e = to_le(it->e->list.next);
			if (it->e == e)
				it->e = NULL;
		}

		/* Move it->stop backwards if necessary. */
		if (it->stop == e) {
			it->stop = to_le(it->stop->list.prev);
			if (it->stop == e)
				it->stop = NULL;
		}
	}
}

/*--------------*/

/*
 * Remove a specific entry from the lru.
 */
static void lru_remove(struct lru *lru, struct lru_entry *le)
{
	lru_iter_invalidate(lru, le);
	if (lru->count == 1) {
		lru->cursor = NULL;
	} else {
		if (lru->cursor == &le->list)
			lru->cursor = lru->cursor->next;
		list_del(&le->list);
	}
	lru->count--;
}

/*
 * Mark as referenced.
 */
static inline void lru_reference(struct lru_entry *le)
{
	atomic_set(&le->referenced, 1);
}

/*--------------*/

/*
 * Remove the least recently used entry (approximately) that passes the
 * predicate.  Returns NULL on failure.
 */
enum evict_result {
	ER_EVICT,
	ER_DONT_EVICT,
	ER_STOP, /* stop looking for something to evict */
};

typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);

static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep)
{
	unsigned long tested = 0;
	struct list_head *h = lru->cursor;
	struct lru_entry *le;

	if (!h)
		return NULL;
	/*
	 * In the worst case we have to loop around twice. Once to clear
	 * the reference flags, and then again to discover the predicate
	 * fails for all entries.
	 */
	while (tested < lru->count) {
		le = container_of(h, struct lru_entry, list);

		if (atomic_read(&le->referenced)) {
			atomic_set(&le->referenced, 0);
		} else {
			tested++;
			switch (pred(le, context)) {
			case ER_EVICT:
				/*
				 * Adjust the cursor, so we start the next
				 * search from here.
				 */
				lru->cursor = le->list.next;
				lru_remove(lru, le);
				return le;

			case ER_DONT_EVICT:
				break;

			case ER_STOP:
				lru->cursor = le->list.next;
				return NULL;
			}
		}

		h = h->next;

		if (!no_sleep)
			cond_resched();
	}

	return NULL;
}
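
/*
 * Illustrative predicate sketch (not part of the driver): with a
 * predicate that always says ER_EVICT, lru_evict() returns the first
 * unreferenced entry the cursor finds, clearing reference bits as it
 * sweeps.  The real predicates below (e.g. __evict_pred) also inspect
 * dm_buffer state.
 *
 *	static enum evict_result evict_any(struct lru_entry *le, void *context)
 *	{
 *		return ER_EVICT;
 *	}
 *
 *	struct lru_entry *le = lru_evict(&lru, evict_any, NULL, false);
 */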

/*--------------------------------------------------------------*/

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	/* protected by the locks in dm_buffer_cache */
	struct rb_node node;

	/* immutable, so don't need protecting */
	sector_t block;
	void *data;
	unsigned char data_mode;		/* DATA_MODE_* */

	/*
	 * These two fields are used in isolation, so do not need
	 * a surrounding lock.
	 */
	atomic_t hold_count;
	unsigned long last_accessed;

	/*
	 * Everything else is protected by the mutex in
	 * dm_bufio_client
	 */
	unsigned long state;
	struct lru_entry lru;
	unsigned char list_mode;		/* LIST_* */
	blk_status_t read_error;
	blk_status_t write_error;
	unsigned int dirty_start;
	unsigned int dirty_end;
	unsigned int write_start;
	unsigned int write_end;
	struct list_head write_list;
	struct dm_bufio_client *c;
	void (*end_io)(struct dm_buffer *b, blk_status_t bs);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
	unsigned int stack_len;
	unsigned long stack_entries[MAX_STACK];
#endif
};

/*--------------------------------------------------------------*/

/*
 * The buffer cache manages buffers, particularly:
 *  - inc/dec of holder count
 *  - setting the last_accessed field
 *  - maintains clean/dirty state along with lru
 *  - selecting buffers that match predicates
 *
 * It does *not* handle:
 *  - allocation/freeing of buffers.
 *  - IO
 *  - Eviction or cache sizing.
 *
 * cache_get() and cache_put() are threadsafe; you do not need to
 * protect these calls with a surrounding mutex.  All the other
 * methods are not threadsafe; they do use locking primitives, but
 * only enough to ensure get/put are threadsafe.
 */

struct buffer_tree {
	union {
		struct rw_semaphore lock;
		rwlock_t spinlock;
	} u;
	struct rb_root root;
} ____cacheline_aligned_in_smp;

struct dm_buffer_cache {
	struct lru lru[LIST_SIZE];
	/*
	 * We spread entries across multiple trees to reduce contention
	 * on the locks.
	 */
	unsigned int num_locks;
	bool no_sleep;
	struct buffer_tree trees[];
};

static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);

static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
{
	return dm_hash_locks_index(block, num_locks);
}

static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

/*
 * Sometimes we want to repeatedly get and drop locks as part of an iteration.
 * This struct helps avoid redundant drop and gets of the same lock.
 */
struct lock_history {
	struct dm_buffer_cache *cache;
	bool write;
	unsigned int previous;
	unsigned int no_previous;
};

static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
{
	lh->cache = cache;
	lh->write = write;
	lh->no_previous = cache->num_locks;
	lh->previous = lh->no_previous;
}

static void __lh_lock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_read(&lh->cache->trees[index].u.lock);
	}
}

static void __lh_unlock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_read(&lh->cache->trees[index].u.lock);
	}
}

/*
 * Make sure you call this since it will unlock the final lock.
 */
static void lh_exit(struct lock_history *lh)
{
	if (lh->previous != lh->no_previous) {
		__lh_unlock(lh, lh->previous);
		lh->previous = lh->no_previous;
	}
}

/*
 * Named 'next' because there is no corresponding
 * 'up/unlock' call since it's done automatically.
 */
static void lh_next(struct lock_history *lh, sector_t b)
{
	unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */

	if (lh->previous != lh->no_previous) {
		if (lh->previous != index) {
			__lh_unlock(lh, lh->previous);
			__lh_lock(lh, index);
			lh->previous = index;
		}
	} else {
		__lh_lock(lh, index);
		lh->previous = index;
	}
}
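
/*
 * Typical usage sketch (illustrative; the same pattern appears in
 * cache_evict() and cache_mark_many() below): take the lock covering
 * each block as it is visited, switching locks only when the block
 * hashes to a different tree, and release the final lock in lh_exit().
 *
 *	struct lock_history lh;
 *
 *	lh_init(&lh, bc, true);			// bc is a struct dm_buffer_cache *
 *	for_each_block_of_interest(block) {	// hypothetical loop
 *		lh_next(&lh, block);		// lock covering 'block' held from here
 *		...touch the tree for 'block'...
 *	}
 *	lh_exit(&lh);				// drops the last lock taken
 */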

static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
{
	return container_of(le, struct dm_buffer, lru);
}

static struct dm_buffer *list_to_buffer(struct list_head *l)
{
	struct lru_entry *le = list_entry(l, struct lru_entry, list);

	if (!le)
		return NULL;

	return le_to_buffer(le);
}

static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep)
{
	unsigned int i;

	bc->num_locks = num_locks;
	bc->no_sleep = no_sleep;

	for (i = 0; i < bc->num_locks; i++) {
		if (no_sleep)
			rwlock_init(&bc->trees[i].u.spinlock);
		else
			init_rwsem(&bc->trees[i].u.lock);
		bc->trees[i].root = RB_ROOT;
	}

	lru_init(&bc->lru[LIST_CLEAN]);
	lru_init(&bc->lru[LIST_DIRTY]);
}

static void cache_destroy(struct dm_buffer_cache *bc)
{
	unsigned int i;

	for (i = 0; i < bc->num_locks; i++)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));

	lru_destroy(&bc->lru[LIST_CLEAN]);
	lru_destroy(&bc->lru[LIST_DIRTY]);
}

/*--------------*/

/*
 * Not threadsafe, or racy, depending on how you look at it.
 */
static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
{
	return bc->lru[list_mode].count;
}

static inline unsigned long cache_total(struct dm_buffer_cache *bc)
{
	return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
}

/*--------------*/

/*
 * Gets a specific buffer, indexed by block.
 * If the buffer is found then its holder count will be incremented and
 * lru_reference will be called.
 *
 * threadsafe
 */
static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
{
	struct rb_node *n = root->rb_node;
	struct dm_buffer *b;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		n = block < b->block ? n->rb_left : n->rb_right;
	}

	return NULL;
}

static void __cache_inc_buffer(struct dm_buffer *b)
{
	atomic_inc(&b->hold_count);
	WRITE_ONCE(b->last_accessed, jiffies);
}

static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
{
	struct dm_buffer *b;

	cache_read_lock(bc, block);
	b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
	if (b) {
		lru_reference(&b->lru);
		__cache_inc_buffer(b);
	}
	cache_read_unlock(bc, block);

	return b;
}

/*--------------*/

/*
 * Returns true if the hold count hits zero.
 * threadsafe
 */
static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	cache_read_lock(bc, b->block);
	BUG_ON(!atomic_read(&b->hold_count));
	r = atomic_dec_and_test(&b->hold_count);
	cache_read_unlock(bc, b->block);

	return r;
}
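
/*
 * Pairing sketch (illustrative only): every successful cache_get()
 * must eventually be balanced by a cache_put(), whose return value
 * reports when the last holder has gone away.
 *
 *	struct dm_buffer *b = cache_get(bc, block);
 *
 *	if (b) {
 *		...use b->data...
 *
 *		if (cache_put(bc, b)) {
 *			// hold_count dropped to zero
 *		}
 *	}
 */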

/*--------------*/

typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);

/*
 * Evicts a buffer based on a predicate.  The oldest buffer that
 * matches the predicate will be selected.  In addition to matching
 * the predicate, the selected buffer will have a hold_count of zero.
 */
struct evict_wrapper {
	struct lock_history *lh;
	b_predicate pred;
	void *context;
};

/*
 * Wraps the buffer predicate turning it into an lru predicate.  Adds
 * extra test for hold_count.
 */
static enum evict_result __evict_pred(struct lru_entry *le, void *context)
{
	struct evict_wrapper *w = context;
	struct dm_buffer *b = le_to_buffer(le);

	lh_next(w->lh, b->block);

	if (atomic_read(&b->hold_count))
		return ER_DONT_EVICT;

	return w->pred(b, w->context);
}

static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
				       b_predicate pred, void *context,
				       struct lock_history *lh)
{
	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
	struct lru_entry *le;
	struct dm_buffer *b;

	le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep);
	if (!le)
		return NULL;

	b = le_to_buffer(le);
	/* __evict_pred will have locked the appropriate tree. */
	rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);

	return b;
}

static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
				     b_predicate pred, void *context)
{
	struct dm_buffer *b;
	struct lock_history lh;

	lh_init(&lh, bc, true);
	b = __cache_evict(bc, list_mode, pred, context, &lh);
	lh_exit(&lh);

	return b;
}

/*--------------*/

/*
 * Mark a buffer as clean or dirty. Not threadsafe.
 */
static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
{
	cache_write_lock(bc, b->block);
	if (list_mode != b->list_mode) {
		lru_remove(&bc->lru[b->list_mode], &b->lru);
		b->list_mode = list_mode;
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	}
	cache_write_unlock(bc, b->block);
}

/*--------------*/

/*
 * Runs through the lru associated with 'old_mode'; entries that match
 * the predicate are moved to 'new_mode'.  Not threadsafe.
 */
static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
			      b_predicate pred, void *context, struct lock_history *lh)
{
	struct lru_entry *le;
	struct dm_buffer *b;
	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};

	while (true) {
		le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep);
		if (!le)
			break;

		b = le_to_buffer(le);
		b->list_mode = new_mode;
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	}
}

static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
			    b_predicate pred, void *context)
{
	struct lock_history lh;

	lh_init(&lh, bc, true);
	__cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
	lh_exit(&lh);
}

/*--------------*/

/*
 * Iterates through all clean or dirty entries calling a function for each
 * entry.  The callback may terminate the iteration early.  Not threadsafe.
 */

/*
 * Iterator functions should return one of these actions to indicate
 * how the iteration should proceed.
 */
enum it_action {
	IT_NEXT,
	IT_COMPLETE,
};

typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);

static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
			    iter_fn fn, void *context, struct lock_history *lh)
{
	struct lru *lru = &bc->lru[list_mode];
	struct lru_entry *le, *first;

	if (!lru->cursor)
		return;

	first = le = to_le(lru->cursor);
	do {
		struct dm_buffer *b = le_to_buffer(le);

		lh_next(lh, b->block);

		switch (fn(b, context)) {
		case IT_NEXT:
			break;

		case IT_COMPLETE:
			return;
		}
		cond_resched();

		le = to_le(le->list.next);
	} while (le != first);
}

static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
			  iter_fn fn, void *context)
{
	struct lock_history lh;

	lh_init(&lh, bc, false);
	__cache_iterate(bc, list_mode, fn, context, &lh);
	lh_exit(&lh);
}
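
/*
 * Illustrative iterator sketch (not part of the driver): count dirty
 * buffers, stopping early once a hypothetical limit is reached.
 *
 *	static enum it_action count_one(struct dm_buffer *b, void *context)
 *	{
 *		unsigned long *count = context;
 *
 *		return (++*count < 100) ? IT_NEXT : IT_COMPLETE;
 *	}
 *
 *	unsigned long count = 0;
 *
 *	cache_iterate(&c->cache, LIST_DIRTY, count_one, &count);
 */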

/*--------------*/

/*
 * Passes ownership of the buffer to the cache. Returns false if the
 * buffer was already present (in which case ownership does not pass),
 * e.g. due to a race with another thread.
 *
 * Holder count should be 1 on insertion.
 *
 * Not threadsafe.
 */
static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;
	struct dm_buffer *found;

	while (*new) {
		found = container_of(*new, struct dm_buffer, node);

		if (found->block == b->block)
			return false;

		parent = *new;
		new = b->block < found->block ?
			&found->node.rb_left : &found->node.rb_right;
	}

	rb_link_node(&b->node, parent, new);
	rb_insert_color(&b->node, root);

	return true;
}

static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
		return false;

	cache_write_lock(bc, b->block);
	BUG_ON(atomic_read(&b->hold_count) != 1);
	r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
	if (r)
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	cache_write_unlock(bc, b->block);

	return r;
}
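
/*
 * Lifecycle sketch (illustrative only): a freshly allocated buffer is
 * handed to the cache with a hold_count of 1; if another thread won the
 * race, the caller keeps ownership and must dispose of the buffer itself.
 *
 *	b->block = block;
 *	b->list_mode = LIST_CLEAN;
 *	atomic_set(&b->hold_count, 1);
 *
 *	if (!cache_insert(&c->cache, b)) {
 *		// lost the race; 'b' still belongs to the caller
 *	}
 */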

/*--------------*/

/*
 * Removes a buffer from the cache; ownership of the buffer passes back
 * to the caller.  Fails unless the hold_count is exactly one (i.e. the
 * caller holds the only reference).
 *
 * Not threadsafe.
 */
static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	cache_write_lock(bc, b->block);

	if (atomic_read(&b->hold_count) != 1) {
		r = false;
	} else {
		r = true;
		rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
		lru_remove(&bc->lru[b->list_mode], &b->lru);
	}

	cache_write_unlock(bc, b->block);

	return r;
}

/*--------------*/

typedef void (*b_release)(struct dm_buffer *);

static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
{
	struct rb_node *n = root->rb_node;
	struct dm_buffer *b;
	struct dm_buffer *best = NULL;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		if (block <= b->block) {
			n = n->rb_left;
			best = b;
		} else {
			n = n->rb_right;
		}
	}

	return best;
}

static void __remove_range(struct dm_buffer_cache *bc,
			   struct rb_root *root,
			   sector_t begin, sector_t end,
			   b_predicate pred, b_release release)
{
	struct dm_buffer *b;

	while (true) {
		cond_resched();

		b = __find_next(root, begin);
		if (!b || (b->block >= end))
			break;

		begin = b->block + 1;

		if (atomic_read(&b->hold_count))
			continue;

		if (pred(b, NULL) == ER_EVICT) {
			rb_erase(&b->node, root);
			lru_remove(&bc->lru[b->list_mode], &b->lru);
			release(b);
		}
	}
}

static void cache_remove_range(struct dm_buffer_cache *bc,
			       sector_t begin, sector_t end,
			       b_predicate pred, b_release release)
{
	unsigned int i;

	BUG_ON(bc->no_sleep);
	for (i = 0; i < bc->num_locks; i++) {
		down_write(&bc->trees[i].u.lock);
		__remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
		up_write(&bc->trees[i].u.lock);
	}
}

/*----------------------------------------------------------------*/

/*
 * Linking of buffers:
 *	All buffers are linked to buffer_cache with their node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	the dirty lru too.  They are later added to the clean lru in
 *	process context.
 */
struct dm_bufio_client {
	struct block_device *bdev;
	unsigned int block_size;
	s8 sectors_per_block_bits;

	bool no_sleep;
	struct mutex lock;
	spinlock_t spinlock;

	int async_write_error;

	void (*alloc_callback)(struct dm_buffer *buf);
	void (*write_callback)(struct dm_buffer *buf);
	struct kmem_cache *slab_buffer;
	struct kmem_cache *slab_cache;
	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned int need_reserved_buffers;

	unsigned int minimum_buffers;

	sector_t start;

	struct shrinker shrinker;
	struct work_struct shrink_work;
	atomic_long_t need_shrink;

	wait_queue_head_t free_buffer_wait;

	struct list_head client_list;

	/*
	 * Used by global_cleanup to sort the clients list.
	 */
	unsigned long oldest_buffer;

	struct dm_buffer_cache cache; /* must be last member */
};

/*----------------------------------------------------------------*/

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_lock_bh(&c->spinlock);
	else
		mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_unlock_bh(&c->spinlock);
	else
		mutex_unlock(&c->lock);
}

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(global_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_cleanup_old_work;
static struct work_struct dm_bufio_replacement_work;


#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif

/*----------------------------------------------------------------*/

static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
{
	unsigned char data_mode;
	long diff;

	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	data_mode = b->data_mode;
	diff = (long)b->c->block_size;
	if (unlink)
		diff = -diff;

	spin_lock(&global_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	if (!unlink) {
		if (dm_bufio_current_allocated > dm_bufio_cache_size)
			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
	}

	spin_unlock(&global_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
		return;
	if (WARN_ON(dm_bufio_client_count < 0))
		return;

	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}
}

115862306a36Sopenharmony_ci/*
115962306a36Sopenharmony_ci * Allocating buffer data.
116062306a36Sopenharmony_ci *
116162306a36Sopenharmony_ci * Small buffers are allocated with kmem_cache, to use space optimally.
116262306a36Sopenharmony_ci *
116362306a36Sopenharmony_ci * For large buffers, we choose between get_free_pages and vmalloc.
116462306a36Sopenharmony_ci * Each has advantages and disadvantages.
116562306a36Sopenharmony_ci *
116662306a36Sopenharmony_ci * __get_free_pages can randomly fail if the memory is fragmented.
116762306a36Sopenharmony_ci * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
116862306a36Sopenharmony_ci * as low as 128M) so using it for caching is not appropriate.
116962306a36Sopenharmony_ci *
117062306a36Sopenharmony_ci * If the allocation may fail, we use __get_free_pages. Memory fragmentation
117162306a36Sopenharmony_ci * is not fatal here; it merely causes some other buffers to be flushed and
117262306a36Sopenharmony_ci * more I/O to be performed. Don't use __get_free_pages if it would always
117362306a36Sopenharmony_ci * fail (i.e. order > MAX_ORDER).
117462306a36Sopenharmony_ci *
117562306a36Sopenharmony_ci * If the allocation shouldn't fail we use __vmalloc. This is only for the
117662306a36Sopenharmony_ci * initial reserve allocation, so there's no risk of wasting all vmalloc
117762306a36Sopenharmony_ci * space.
117862306a36Sopenharmony_ci */
117962306a36Sopenharmony_cistatic void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
118062306a36Sopenharmony_ci			       unsigned char *data_mode)
118162306a36Sopenharmony_ci{
118262306a36Sopenharmony_ci	if (unlikely(c->slab_cache != NULL)) {
118362306a36Sopenharmony_ci		*data_mode = DATA_MODE_SLAB;
118462306a36Sopenharmony_ci		return kmem_cache_alloc(c->slab_cache, gfp_mask);
118562306a36Sopenharmony_ci	}
118662306a36Sopenharmony_ci
118762306a36Sopenharmony_ci	if (c->block_size <= KMALLOC_MAX_SIZE &&
118862306a36Sopenharmony_ci	    gfp_mask & __GFP_NORETRY) {
118962306a36Sopenharmony_ci		*data_mode = DATA_MODE_GET_FREE_PAGES;
119062306a36Sopenharmony_ci		return (void *)__get_free_pages(gfp_mask,
119162306a36Sopenharmony_ci						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
119262306a36Sopenharmony_ci	}
119362306a36Sopenharmony_ci
119462306a36Sopenharmony_ci	*data_mode = DATA_MODE_VMALLOC;
119562306a36Sopenharmony_ci
119662306a36Sopenharmony_ci	return __vmalloc(c->block_size, gfp_mask);
119762306a36Sopenharmony_ci}
119862306a36Sopenharmony_ci
119962306a36Sopenharmony_ci/*
120062306a36Sopenharmony_ci * Free buffer's data.
120162306a36Sopenharmony_ci */
120262306a36Sopenharmony_cistatic void free_buffer_data(struct dm_bufio_client *c,
120362306a36Sopenharmony_ci			     void *data, unsigned char data_mode)
120462306a36Sopenharmony_ci{
120562306a36Sopenharmony_ci	switch (data_mode) {
120662306a36Sopenharmony_ci	case DATA_MODE_SLAB:
120762306a36Sopenharmony_ci		kmem_cache_free(c->slab_cache, data);
120862306a36Sopenharmony_ci		break;
120962306a36Sopenharmony_ci
121062306a36Sopenharmony_ci	case DATA_MODE_GET_FREE_PAGES:
121162306a36Sopenharmony_ci		free_pages((unsigned long)data,
121262306a36Sopenharmony_ci			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
121362306a36Sopenharmony_ci		break;
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_ci	case DATA_MODE_VMALLOC:
121662306a36Sopenharmony_ci		vfree(data);
121762306a36Sopenharmony_ci		break;
121862306a36Sopenharmony_ci
121962306a36Sopenharmony_ci	default:
122062306a36Sopenharmony_ci		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
122162306a36Sopenharmony_ci		       data_mode);
122262306a36Sopenharmony_ci		BUG();
122362306a36Sopenharmony_ci	}
122462306a36Sopenharmony_ci}
122562306a36Sopenharmony_ci
122662306a36Sopenharmony_ci/*
122762306a36Sopenharmony_ci * Allocate buffer and its data.
122862306a36Sopenharmony_ci */
122962306a36Sopenharmony_cistatic struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
123062306a36Sopenharmony_ci{
123162306a36Sopenharmony_ci	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
123262306a36Sopenharmony_ci
123362306a36Sopenharmony_ci	if (!b)
123462306a36Sopenharmony_ci		return NULL;
123562306a36Sopenharmony_ci
123662306a36Sopenharmony_ci	b->c = c;
123762306a36Sopenharmony_ci
123862306a36Sopenharmony_ci	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
123962306a36Sopenharmony_ci	if (!b->data) {
124062306a36Sopenharmony_ci		kmem_cache_free(c->slab_buffer, b);
124162306a36Sopenharmony_ci		return NULL;
124262306a36Sopenharmony_ci	}
124362306a36Sopenharmony_ci	adjust_total_allocated(b, false);
124462306a36Sopenharmony_ci
124562306a36Sopenharmony_ci#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
124662306a36Sopenharmony_ci	b->stack_len = 0;
124762306a36Sopenharmony_ci#endif
124862306a36Sopenharmony_ci	return b;
124962306a36Sopenharmony_ci}
125062306a36Sopenharmony_ci
125162306a36Sopenharmony_ci/*
125262306a36Sopenharmony_ci * Free buffer and its data.
125362306a36Sopenharmony_ci */
125462306a36Sopenharmony_cistatic void free_buffer(struct dm_buffer *b)
125562306a36Sopenharmony_ci{
125662306a36Sopenharmony_ci	struct dm_bufio_client *c = b->c;
125762306a36Sopenharmony_ci
125862306a36Sopenharmony_ci	adjust_total_allocated(b, true);
125962306a36Sopenharmony_ci	free_buffer_data(c, b->data, b->data_mode);
126062306a36Sopenharmony_ci	kmem_cache_free(c->slab_buffer, b);
126162306a36Sopenharmony_ci}
126262306a36Sopenharmony_ci
126362306a36Sopenharmony_ci/*
126462306a36Sopenharmony_ci *--------------------------------------------------------------------------
126562306a36Sopenharmony_ci * Submit I/O on the buffer.
126662306a36Sopenharmony_ci *
126762306a36Sopenharmony_ci * The bio interface is faster, but it has some problems:
126862306a36Sopenharmony_ci *	the vector list is limited (increasing this limit increases
126962306a36Sopenharmony_ci *	memory consumption per buffer, so it is not viable);
127062306a36Sopenharmony_ci *
127162306a36Sopenharmony_ci *	the memory must be direct-mapped, not vmalloced;
127262306a36Sopenharmony_ci *
127362306a36Sopenharmony_ci * If the buffer is not vmalloced (so its data is physically contiguous),
127462306a36Sopenharmony_ci * try using the bio interface.
127562306a36Sopenharmony_ci *
127662306a36Sopenharmony_ci * If the buffer is vmalloced, or if a bio cannot be allocated without
127762306a36Sopenharmony_ci * sleeping, use the dm-io layer to do the I/O.
127862306a36Sopenharmony_ci * The dm-io layer splits the I/O into multiple requests, avoiding the above
127962306a36Sopenharmony_ci * shortcomings.
128062306a36Sopenharmony_ci *--------------------------------------------------------------------------
128162306a36Sopenharmony_ci */
128262306a36Sopenharmony_ci
128362306a36Sopenharmony_ci/*
128462306a36Sopenharmony_ci * dm-io completion routine. It just calls b->end_io, pretending that the
128562306a36Sopenharmony_ci * request was handled directly by the bio interface.
128662306a36Sopenharmony_ci */
128762306a36Sopenharmony_cistatic void dmio_complete(unsigned long error, void *context)
128862306a36Sopenharmony_ci{
128962306a36Sopenharmony_ci	struct dm_buffer *b = context;
129062306a36Sopenharmony_ci
129162306a36Sopenharmony_ci	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
129262306a36Sopenharmony_ci}
129362306a36Sopenharmony_ci
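/*
 * Submit the I/O through the dm-io layer.  A vmalloc-backed buffer must be
 * passed as DM_IO_VMA; physically contiguous data can be passed as DM_IO_KMEM.
 */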
129462306a36Sopenharmony_cistatic void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
129562306a36Sopenharmony_ci		     unsigned int n_sectors, unsigned int offset)
129662306a36Sopenharmony_ci{
129762306a36Sopenharmony_ci	int r;
129862306a36Sopenharmony_ci	struct dm_io_request io_req = {
129962306a36Sopenharmony_ci		.bi_opf = op,
130062306a36Sopenharmony_ci		.notify.fn = dmio_complete,
130162306a36Sopenharmony_ci		.notify.context = b,
130262306a36Sopenharmony_ci		.client = b->c->dm_io,
130362306a36Sopenharmony_ci	};
130462306a36Sopenharmony_ci	struct dm_io_region region = {
130562306a36Sopenharmony_ci		.bdev = b->c->bdev,
130662306a36Sopenharmony_ci		.sector = sector,
130762306a36Sopenharmony_ci		.count = n_sectors,
130862306a36Sopenharmony_ci	};
130962306a36Sopenharmony_ci
131062306a36Sopenharmony_ci	if (b->data_mode != DATA_MODE_VMALLOC) {
131162306a36Sopenharmony_ci		io_req.mem.type = DM_IO_KMEM;
131262306a36Sopenharmony_ci		io_req.mem.ptr.addr = (char *)b->data + offset;
131362306a36Sopenharmony_ci	} else {
131462306a36Sopenharmony_ci		io_req.mem.type = DM_IO_VMA;
131562306a36Sopenharmony_ci		io_req.mem.ptr.vma = (char *)b->data + offset;
131662306a36Sopenharmony_ci	}
131762306a36Sopenharmony_ci
131862306a36Sopenharmony_ci	r = dm_io(&io_req, 1, &region, NULL, IOPRIO_DEFAULT);
131962306a36Sopenharmony_ci	if (unlikely(r))
132062306a36Sopenharmony_ci		b->end_io(b, errno_to_blk_status(r));
132162306a36Sopenharmony_ci}
132262306a36Sopenharmony_ci
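/* The bio endio routine: free the bio and forward the status to b->end_io. */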
132362306a36Sopenharmony_cistatic void bio_complete(struct bio *bio)
132462306a36Sopenharmony_ci{
132562306a36Sopenharmony_ci	struct dm_buffer *b = bio->bi_private;
132662306a36Sopenharmony_ci	blk_status_t status = bio->bi_status;
132762306a36Sopenharmony_ci
132862306a36Sopenharmony_ci	bio_uninit(bio);
132962306a36Sopenharmony_ci	kfree(bio);
133062306a36Sopenharmony_ci	b->end_io(b, status);
133162306a36Sopenharmony_ci}
133262306a36Sopenharmony_ci
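/*
 * Submit the I/O with a single-vector bio.  If the bio cannot be allocated
 * without sleeping, fall back to the dm-io path.
 */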
133362306a36Sopenharmony_cistatic void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
133462306a36Sopenharmony_ci		    unsigned int n_sectors, unsigned int offset)
133562306a36Sopenharmony_ci{
133662306a36Sopenharmony_ci	struct bio *bio;
133762306a36Sopenharmony_ci	char *ptr;
133862306a36Sopenharmony_ci	unsigned int len;
133962306a36Sopenharmony_ci
134062306a36Sopenharmony_ci	bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
134162306a36Sopenharmony_ci	if (!bio) {
134262306a36Sopenharmony_ci		use_dmio(b, op, sector, n_sectors, offset);
134362306a36Sopenharmony_ci		return;
134462306a36Sopenharmony_ci	}
134562306a36Sopenharmony_ci	bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
134662306a36Sopenharmony_ci	bio->bi_iter.bi_sector = sector;
134762306a36Sopenharmony_ci	bio->bi_end_io = bio_complete;
134862306a36Sopenharmony_ci	bio->bi_private = b;
134962306a36Sopenharmony_ci
135062306a36Sopenharmony_ci	ptr = (char *)b->data + offset;
135162306a36Sopenharmony_ci	len = n_sectors << SECTOR_SHIFT;
135262306a36Sopenharmony_ci
135362306a36Sopenharmony_ci	__bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
135462306a36Sopenharmony_ci
135562306a36Sopenharmony_ci	submit_bio(bio);
135662306a36Sopenharmony_ci}
135762306a36Sopenharmony_ci
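/*
 * Convert a block number to the corresponding sector on the underlying
 * device, taking the client's start offset into account.
 */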
135862306a36Sopenharmony_cistatic inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
135962306a36Sopenharmony_ci{
136062306a36Sopenharmony_ci	sector_t sector;
136162306a36Sopenharmony_ci
136262306a36Sopenharmony_ci	if (likely(c->sectors_per_block_bits >= 0))
136362306a36Sopenharmony_ci		sector = block << c->sectors_per_block_bits;
136462306a36Sopenharmony_ci	else
136562306a36Sopenharmony_ci		sector = block * (c->block_size >> SECTOR_SHIFT);
136662306a36Sopenharmony_ci	sector += c->start;
136762306a36Sopenharmony_ci
136862306a36Sopenharmony_ci	return sector;
136962306a36Sopenharmony_ci}
137062306a36Sopenharmony_ci
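/*
 * Reads always cover the whole buffer.  Writes cover only the dirty range,
 * rounded out to DM_BUFIO_WRITE_ALIGN boundaries and clamped to the block
 * size.
 */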
137162306a36Sopenharmony_cistatic void submit_io(struct dm_buffer *b, enum req_op op,
137262306a36Sopenharmony_ci		      void (*end_io)(struct dm_buffer *, blk_status_t))
137362306a36Sopenharmony_ci{
137462306a36Sopenharmony_ci	unsigned int n_sectors;
137562306a36Sopenharmony_ci	sector_t sector;
137662306a36Sopenharmony_ci	unsigned int offset, end;
137762306a36Sopenharmony_ci
137862306a36Sopenharmony_ci	b->end_io = end_io;
137962306a36Sopenharmony_ci
138062306a36Sopenharmony_ci	sector = block_to_sector(b->c, b->block);
138162306a36Sopenharmony_ci
138262306a36Sopenharmony_ci	if (op != REQ_OP_WRITE) {
138362306a36Sopenharmony_ci		n_sectors = b->c->block_size >> SECTOR_SHIFT;
138462306a36Sopenharmony_ci		offset = 0;
138562306a36Sopenharmony_ci	} else {
138662306a36Sopenharmony_ci		if (b->c->write_callback)
138762306a36Sopenharmony_ci			b->c->write_callback(b);
138862306a36Sopenharmony_ci		offset = b->write_start;
138962306a36Sopenharmony_ci		end = b->write_end;
139062306a36Sopenharmony_ci		offset &= -DM_BUFIO_WRITE_ALIGN;
139162306a36Sopenharmony_ci		end += DM_BUFIO_WRITE_ALIGN - 1;
139262306a36Sopenharmony_ci		end &= -DM_BUFIO_WRITE_ALIGN;
139362306a36Sopenharmony_ci		if (unlikely(end > b->c->block_size))
139462306a36Sopenharmony_ci			end = b->c->block_size;
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_ci		sector += offset >> SECTOR_SHIFT;
139762306a36Sopenharmony_ci		n_sectors = (end - offset) >> SECTOR_SHIFT;
139862306a36Sopenharmony_ci	}
139962306a36Sopenharmony_ci
140062306a36Sopenharmony_ci	if (b->data_mode != DATA_MODE_VMALLOC)
140162306a36Sopenharmony_ci		use_bio(b, op, sector, n_sectors, offset);
140262306a36Sopenharmony_ci	else
140362306a36Sopenharmony_ci		use_dmio(b, op, sector, n_sectors, offset);
140462306a36Sopenharmony_ci}
140562306a36Sopenharmony_ci
140662306a36Sopenharmony_ci/*
140762306a36Sopenharmony_ci *--------------------------------------------------------------
140862306a36Sopenharmony_ci * Writing dirty buffers
140962306a36Sopenharmony_ci *--------------------------------------------------------------
141062306a36Sopenharmony_ci */
141162306a36Sopenharmony_ci
141262306a36Sopenharmony_ci/*
141362306a36Sopenharmony_ci * The endio routine for write.
141462306a36Sopenharmony_ci *
141562306a36Sopenharmony_ci * Set the error, clear the B_WRITING bit and wake anyone who was waiting on
141662306a36Sopenharmony_ci * it.
141762306a36Sopenharmony_ci */
141862306a36Sopenharmony_cistatic void write_endio(struct dm_buffer *b, blk_status_t status)
141962306a36Sopenharmony_ci{
142062306a36Sopenharmony_ci	b->write_error = status;
142162306a36Sopenharmony_ci	if (unlikely(status)) {
142262306a36Sopenharmony_ci		struct dm_bufio_client *c = b->c;
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_ci		(void)cmpxchg(&c->async_write_error, 0,
142562306a36Sopenharmony_ci				blk_status_to_errno(status));
142662306a36Sopenharmony_ci	}
142762306a36Sopenharmony_ci
142862306a36Sopenharmony_ci	BUG_ON(!test_bit(B_WRITING, &b->state));
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_ci	smp_mb__before_atomic();
143162306a36Sopenharmony_ci	clear_bit(B_WRITING, &b->state);
143262306a36Sopenharmony_ci	smp_mb__after_atomic();
143362306a36Sopenharmony_ci
143462306a36Sopenharmony_ci	wake_up_bit(&b->state, B_WRITING);
143562306a36Sopenharmony_ci}
143662306a36Sopenharmony_ci
143762306a36Sopenharmony_ci/*
143862306a36Sopenharmony_ci * Initiate a write on a dirty buffer, but don't wait for it.
143962306a36Sopenharmony_ci *
144062306a36Sopenharmony_ci * - If the buffer is not dirty, exit.
144162306a36Sopenharmony_ci * - If there is a previous write going on, wait for it to finish (we can't
144262306a36Sopenharmony_ci *   have two writes on the same buffer simultaneously).
144362306a36Sopenharmony_ci * - Submit our write and don't wait on it. We set B_WRITING indicating
144462306a36Sopenharmony_ci *   that there is a write in progress.
144562306a36Sopenharmony_ci */
144662306a36Sopenharmony_cistatic void __write_dirty_buffer(struct dm_buffer *b,
144762306a36Sopenharmony_ci				 struct list_head *write_list)
144862306a36Sopenharmony_ci{
144962306a36Sopenharmony_ci	if (!test_bit(B_DIRTY, &b->state))
145062306a36Sopenharmony_ci		return;
145162306a36Sopenharmony_ci
145262306a36Sopenharmony_ci	clear_bit(B_DIRTY, &b->state);
145362306a36Sopenharmony_ci	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
145462306a36Sopenharmony_ci
145562306a36Sopenharmony_ci	b->write_start = b->dirty_start;
145662306a36Sopenharmony_ci	b->write_end = b->dirty_end;
145762306a36Sopenharmony_ci
145862306a36Sopenharmony_ci	if (!write_list)
145962306a36Sopenharmony_ci		submit_io(b, REQ_OP_WRITE, write_endio);
146062306a36Sopenharmony_ci	else
146162306a36Sopenharmony_ci		list_add_tail(&b->write_list, write_list);
146262306a36Sopenharmony_ci}
146362306a36Sopenharmony_ci
146462306a36Sopenharmony_cistatic void __flush_write_list(struct list_head *write_list)
146562306a36Sopenharmony_ci{
146662306a36Sopenharmony_ci	struct blk_plug plug;
146762306a36Sopenharmony_ci
146862306a36Sopenharmony_ci	blk_start_plug(&plug);
146962306a36Sopenharmony_ci	while (!list_empty(write_list)) {
147062306a36Sopenharmony_ci		struct dm_buffer *b =
147162306a36Sopenharmony_ci			list_entry(write_list->next, struct dm_buffer, write_list);
147262306a36Sopenharmony_ci		list_del(&b->write_list);
147362306a36Sopenharmony_ci		submit_io(b, REQ_OP_WRITE, write_endio);
147462306a36Sopenharmony_ci		cond_resched();
147562306a36Sopenharmony_ci	}
147662306a36Sopenharmony_ci	blk_finish_plug(&plug);
147762306a36Sopenharmony_ci}
147862306a36Sopenharmony_ci
147962306a36Sopenharmony_ci/*
148062306a36Sopenharmony_ci * Wait until any activity on the buffer finishes.  Possibly write the
148162306a36Sopenharmony_ci * buffer if it is dirty.  When this function finishes, there is no I/O
148262306a36Sopenharmony_ci * running on the buffer and the buffer is not dirty.
148362306a36Sopenharmony_ci */
148462306a36Sopenharmony_cistatic void __make_buffer_clean(struct dm_buffer *b)
148562306a36Sopenharmony_ci{
148662306a36Sopenharmony_ci	BUG_ON(atomic_read(&b->hold_count));
148762306a36Sopenharmony_ci
148862306a36Sopenharmony_ci	/* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
148962306a36Sopenharmony_ci	if (!smp_load_acquire(&b->state))	/* fast case */
149062306a36Sopenharmony_ci		return;
149162306a36Sopenharmony_ci
149262306a36Sopenharmony_ci	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
149362306a36Sopenharmony_ci	__write_dirty_buffer(b, NULL);
149462306a36Sopenharmony_ci	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
149562306a36Sopenharmony_ci}
149662306a36Sopenharmony_ci
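/*
 * Eviction predicates passed to cache_evict() below; they double as sanity
 * checks that a buffer really belongs on the list it is being evicted from.
 */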
149762306a36Sopenharmony_cistatic enum evict_result is_clean(struct dm_buffer *b, void *context)
149862306a36Sopenharmony_ci{
149962306a36Sopenharmony_ci	struct dm_bufio_client *c = context;
150062306a36Sopenharmony_ci
150162306a36Sopenharmony_ci	/* These should never happen */
150262306a36Sopenharmony_ci	if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
150362306a36Sopenharmony_ci		return ER_DONT_EVICT;
150462306a36Sopenharmony_ci	if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
150562306a36Sopenharmony_ci		return ER_DONT_EVICT;
150662306a36Sopenharmony_ci	if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
150762306a36Sopenharmony_ci		return ER_DONT_EVICT;
150862306a36Sopenharmony_ci
150962306a36Sopenharmony_ci	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
151062306a36Sopenharmony_ci	    unlikely(test_bit(B_READING, &b->state)))
151162306a36Sopenharmony_ci		return ER_DONT_EVICT;
151262306a36Sopenharmony_ci
151362306a36Sopenharmony_ci	return ER_EVICT;
151462306a36Sopenharmony_ci}
151562306a36Sopenharmony_ci
151662306a36Sopenharmony_cistatic enum evict_result is_dirty(struct dm_buffer *b, void *context)
151762306a36Sopenharmony_ci{
151862306a36Sopenharmony_ci	/* These should never happen */
151962306a36Sopenharmony_ci	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
152062306a36Sopenharmony_ci		return ER_DONT_EVICT;
152162306a36Sopenharmony_ci	if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
152262306a36Sopenharmony_ci		return ER_DONT_EVICT;
152362306a36Sopenharmony_ci
152462306a36Sopenharmony_ci	return ER_EVICT;
152562306a36Sopenharmony_ci}
152662306a36Sopenharmony_ci
152762306a36Sopenharmony_ci/*
152862306a36Sopenharmony_ci * Find some buffer that is not held by anybody, clean it, unlink it and
152962306a36Sopenharmony_ci * return it.
153062306a36Sopenharmony_ci */
153162306a36Sopenharmony_cistatic struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
153262306a36Sopenharmony_ci{
153362306a36Sopenharmony_ci	struct dm_buffer *b;
153462306a36Sopenharmony_ci
153562306a36Sopenharmony_ci	b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c);
153662306a36Sopenharmony_ci	if (b) {
153762306a36Sopenharmony_ci		/* this also waits for pending reads */
153862306a36Sopenharmony_ci		__make_buffer_clean(b);
153962306a36Sopenharmony_ci		return b;
154062306a36Sopenharmony_ci	}
154162306a36Sopenharmony_ci
154262306a36Sopenharmony_ci	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
154362306a36Sopenharmony_ci		return NULL;
154462306a36Sopenharmony_ci
154562306a36Sopenharmony_ci	b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL);
154662306a36Sopenharmony_ci	if (b) {
154762306a36Sopenharmony_ci		__make_buffer_clean(b);
154862306a36Sopenharmony_ci		return b;
154962306a36Sopenharmony_ci	}
155062306a36Sopenharmony_ci
155162306a36Sopenharmony_ci	return NULL;
155262306a36Sopenharmony_ci}
155362306a36Sopenharmony_ci
155462306a36Sopenharmony_ci/*
155562306a36Sopenharmony_ci * Wait until some other thread frees a buffer or releases its hold count on
155662306a36Sopenharmony_ci * one.
155762306a36Sopenharmony_ci *
155862306a36Sopenharmony_ci * This function is entered with c->lock held, drops it and regains it
155962306a36Sopenharmony_ci * before exiting.
156062306a36Sopenharmony_ci */
156162306a36Sopenharmony_cistatic void __wait_for_free_buffer(struct dm_bufio_client *c)
156262306a36Sopenharmony_ci{
156362306a36Sopenharmony_ci	DECLARE_WAITQUEUE(wait, current);
156462306a36Sopenharmony_ci
156562306a36Sopenharmony_ci	add_wait_queue(&c->free_buffer_wait, &wait);
156662306a36Sopenharmony_ci	set_current_state(TASK_UNINTERRUPTIBLE);
156762306a36Sopenharmony_ci	dm_bufio_unlock(c);
156862306a36Sopenharmony_ci
156962306a36Sopenharmony_ci	/*
157062306a36Sopenharmony_ci	 * It's possible to miss a wake-up event since we don't always
157162306a36Sopenharmony_ci	 * hold c->lock when wake_up is called.  So we have a timeout here,
157262306a36Sopenharmony_ci	 * just in case.
157362306a36Sopenharmony_ci	 */
157462306a36Sopenharmony_ci	io_schedule_timeout(5 * HZ);
157562306a36Sopenharmony_ci
157662306a36Sopenharmony_ci	remove_wait_queue(&c->free_buffer_wait, &wait);
157762306a36Sopenharmony_ci
157862306a36Sopenharmony_ci	dm_bufio_lock(c);
157962306a36Sopenharmony_ci}
158062306a36Sopenharmony_ci
158162306a36Sopenharmony_cienum new_flag {
158262306a36Sopenharmony_ci	NF_FRESH = 0,
158362306a36Sopenharmony_ci	NF_READ = 1,
158462306a36Sopenharmony_ci	NF_GET = 2,
158562306a36Sopenharmony_ci	NF_PREFETCH = 3
158662306a36Sopenharmony_ci};
158762306a36Sopenharmony_ci
158862306a36Sopenharmony_ci/*
158962306a36Sopenharmony_ci * Allocate a new buffer. If the allocation is not possible, wait until
159062306a36Sopenharmony_ci * some other thread frees a buffer.
159162306a36Sopenharmony_ci *
159262306a36Sopenharmony_ci * May drop the lock and regain it.
159362306a36Sopenharmony_ci */
159462306a36Sopenharmony_cistatic struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
159562306a36Sopenharmony_ci{
159662306a36Sopenharmony_ci	struct dm_buffer *b;
159762306a36Sopenharmony_ci	bool tried_noio_alloc = false;
159862306a36Sopenharmony_ci
159962306a36Sopenharmony_ci	/*
160062306a36Sopenharmony_ci	 * dm-bufio is resistant to allocation failures (it just keeps
160162306a36Sopenharmony_ci	 * one buffer reserved in case all the allocations fail).
160262306a36Sopenharmony_ci	 * So set flags to not try too hard:
160362306a36Sopenharmony_ci	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
160462306a36Sopenharmony_ci	 *		    mutex and wait ourselves.
160562306a36Sopenharmony_ci	 *	__GFP_NORETRY: don't retry and rather return failure
160662306a36Sopenharmony_ci	 *	__GFP_NOMEMALLOC: don't use emergency reserves
160762306a36Sopenharmony_ci	 *	__GFP_NOWARN: don't print a warning in case of failure
160862306a36Sopenharmony_ci	 *
160962306a36Sopenharmony_ci	 * For debugging, if we set the cache size to 1, no new buffers will
161062306a36Sopenharmony_ci	 * be allocated.
161162306a36Sopenharmony_ci	 */
161262306a36Sopenharmony_ci	while (1) {
161362306a36Sopenharmony_ci		if (dm_bufio_cache_size_latch != 1) {
161462306a36Sopenharmony_ci			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
161562306a36Sopenharmony_ci			if (b)
161662306a36Sopenharmony_ci				return b;
161762306a36Sopenharmony_ci		}
161862306a36Sopenharmony_ci
161962306a36Sopenharmony_ci		if (nf == NF_PREFETCH)
162062306a36Sopenharmony_ci			return NULL;
162162306a36Sopenharmony_ci
162262306a36Sopenharmony_ci		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
162362306a36Sopenharmony_ci			dm_bufio_unlock(c);
162462306a36Sopenharmony_ci			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
162562306a36Sopenharmony_ci			dm_bufio_lock(c);
162662306a36Sopenharmony_ci			if (b)
162762306a36Sopenharmony_ci				return b;
162862306a36Sopenharmony_ci			tried_noio_alloc = true;
162962306a36Sopenharmony_ci		}
163062306a36Sopenharmony_ci
163162306a36Sopenharmony_ci		if (!list_empty(&c->reserved_buffers)) {
163262306a36Sopenharmony_ci			b = list_to_buffer(c->reserved_buffers.next);
163362306a36Sopenharmony_ci			list_del(&b->lru.list);
163462306a36Sopenharmony_ci			c->need_reserved_buffers++;
163562306a36Sopenharmony_ci
163662306a36Sopenharmony_ci			return b;
163762306a36Sopenharmony_ci		}
163862306a36Sopenharmony_ci
163962306a36Sopenharmony_ci		b = __get_unclaimed_buffer(c);
164062306a36Sopenharmony_ci		if (b)
164162306a36Sopenharmony_ci			return b;
164262306a36Sopenharmony_ci
164362306a36Sopenharmony_ci		__wait_for_free_buffer(c);
164462306a36Sopenharmony_ci	}
164562306a36Sopenharmony_ci}
164662306a36Sopenharmony_ci
164762306a36Sopenharmony_cistatic struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
164862306a36Sopenharmony_ci{
164962306a36Sopenharmony_ci	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
165062306a36Sopenharmony_ci
165162306a36Sopenharmony_ci	if (!b)
165262306a36Sopenharmony_ci		return NULL;
165362306a36Sopenharmony_ci
165462306a36Sopenharmony_ci	if (c->alloc_callback)
165562306a36Sopenharmony_ci		c->alloc_callback(b);
165662306a36Sopenharmony_ci
165762306a36Sopenharmony_ci	return b;
165862306a36Sopenharmony_ci}
165962306a36Sopenharmony_ci
166062306a36Sopenharmony_ci/*
166162306a36Sopenharmony_ci * Free a buffer and wake other threads waiting for free buffers.
166262306a36Sopenharmony_ci */
166362306a36Sopenharmony_cistatic void __free_buffer_wake(struct dm_buffer *b)
166462306a36Sopenharmony_ci{
166562306a36Sopenharmony_ci	struct dm_bufio_client *c = b->c;
166662306a36Sopenharmony_ci
166762306a36Sopenharmony_ci	b->block = -1;
166862306a36Sopenharmony_ci	if (!c->need_reserved_buffers)
166962306a36Sopenharmony_ci		free_buffer(b);
167062306a36Sopenharmony_ci	else {
167162306a36Sopenharmony_ci		list_add(&b->lru.list, &c->reserved_buffers);
167262306a36Sopenharmony_ci		c->need_reserved_buffers--;
167362306a36Sopenharmony_ci	}
167462306a36Sopenharmony_ci
167562306a36Sopenharmony_ci	/*
167662306a36Sopenharmony_ci	 * We hold the bufio lock here, so no one can add entries to the
167762306a36Sopenharmony_ci	 * wait queue anyway.
167862306a36Sopenharmony_ci	 */
167962306a36Sopenharmony_ci	if (unlikely(waitqueue_active(&c->free_buffer_wait)))
168062306a36Sopenharmony_ci		wake_up(&c->free_buffer_wait);
168162306a36Sopenharmony_ci}
168262306a36Sopenharmony_ci
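/*
 * Move buffers whose writes have completed from the dirty list back to the
 * clean list.
 */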
168362306a36Sopenharmony_cistatic enum evict_result cleaned(struct dm_buffer *b, void *context)
168462306a36Sopenharmony_ci{
168562306a36Sopenharmony_ci	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
168662306a36Sopenharmony_ci		return ER_DONT_EVICT; /* should never happen */
168762306a36Sopenharmony_ci
168862306a36Sopenharmony_ci	if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state))
168962306a36Sopenharmony_ci		return ER_DONT_EVICT;
169062306a36Sopenharmony_ci	else
169162306a36Sopenharmony_ci		return ER_EVICT;
169262306a36Sopenharmony_ci}
169362306a36Sopenharmony_ci
169462306a36Sopenharmony_cistatic void __move_clean_buffers(struct dm_bufio_client *c)
169562306a36Sopenharmony_ci{
169662306a36Sopenharmony_ci	cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL);
169762306a36Sopenharmony_ci}
169862306a36Sopenharmony_ci
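/*
 * Write out the dirty list with cache_iterate().  In no-wait mode, stop at
 * the first buffer that already has a write in flight.
 */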
169962306a36Sopenharmony_cistruct write_context {
170062306a36Sopenharmony_ci	int no_wait;
170162306a36Sopenharmony_ci	struct list_head *write_list;
170262306a36Sopenharmony_ci};
170362306a36Sopenharmony_ci
170462306a36Sopenharmony_cistatic enum it_action write_one(struct dm_buffer *b, void *context)
170562306a36Sopenharmony_ci{
170662306a36Sopenharmony_ci	struct write_context *wc = context;
170762306a36Sopenharmony_ci
170862306a36Sopenharmony_ci	if (wc->no_wait && test_bit(B_WRITING, &b->state))
170962306a36Sopenharmony_ci		return IT_COMPLETE;
171062306a36Sopenharmony_ci
171162306a36Sopenharmony_ci	__write_dirty_buffer(b, wc->write_list);
171262306a36Sopenharmony_ci	return IT_NEXT;
171362306a36Sopenharmony_ci}
171462306a36Sopenharmony_ci
171562306a36Sopenharmony_cistatic void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
171662306a36Sopenharmony_ci					struct list_head *write_list)
171762306a36Sopenharmony_ci{
171862306a36Sopenharmony_ci	struct write_context wc = {.no_wait = no_wait, .write_list = write_list};
171962306a36Sopenharmony_ci
172062306a36Sopenharmony_ci	__move_clean_buffers(c);
172162306a36Sopenharmony_ci	cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc);
172262306a36Sopenharmony_ci}
172362306a36Sopenharmony_ci
172462306a36Sopenharmony_ci/*
172562306a36Sopenharmony_ci * Check if we're over the dirty watermark: if the number of dirty buffers
172662306a36Sopenharmony_ci * exceeds DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers, start
172762306a36Sopenharmony_ci * writing them back asynchronously.
172862306a36Sopenharmony_ci */
172962306a36Sopenharmony_cistatic void __check_watermark(struct dm_bufio_client *c,
173062306a36Sopenharmony_ci			      struct list_head *write_list)
173162306a36Sopenharmony_ci{
173262306a36Sopenharmony_ci	if (cache_count(&c->cache, LIST_DIRTY) >
173362306a36Sopenharmony_ci	    cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
173462306a36Sopenharmony_ci		__write_dirty_buffers_async(c, 1, write_list);
173562306a36Sopenharmony_ci}
173662306a36Sopenharmony_ci
173762306a36Sopenharmony_ci/*
173862306a36Sopenharmony_ci *--------------------------------------------------------------
173962306a36Sopenharmony_ci * Getting a buffer
174062306a36Sopenharmony_ci *--------------------------------------------------------------
174162306a36Sopenharmony_ci */
174262306a36Sopenharmony_ci
174362306a36Sopenharmony_cistatic void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
174462306a36Sopenharmony_ci{
174562306a36Sopenharmony_ci	/*
174662306a36Sopenharmony_ci	 * Relying on waitqueue_active() is racy, but the waiter sleeps
174762306a36Sopenharmony_ci	 * with a timeout anyway, so a missed wake-up is not fatal.
174862306a36Sopenharmony_ci	 */
174962306a36Sopenharmony_ci	if (cache_put(&c->cache, b) &&
175062306a36Sopenharmony_ci	    unlikely(waitqueue_active(&c->free_buffer_wait)))
175162306a36Sopenharmony_ci		wake_up(&c->free_buffer_wait);
175262306a36Sopenharmony_ci}
175362306a36Sopenharmony_ci
175462306a36Sopenharmony_ci/*
175562306a36Sopenharmony_ci * This assumes you have already checked the cache to see if the buffer
175662306a36Sopenharmony_ci * is already present (it will recheck after dropping the lock for allocation).
175762306a36Sopenharmony_ci */
175862306a36Sopenharmony_cistatic struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
175962306a36Sopenharmony_ci				     enum new_flag nf, int *need_submit,
176062306a36Sopenharmony_ci				     struct list_head *write_list)
176162306a36Sopenharmony_ci{
176262306a36Sopenharmony_ci	struct dm_buffer *b, *new_b = NULL;
176362306a36Sopenharmony_ci
176462306a36Sopenharmony_ci	*need_submit = 0;
176562306a36Sopenharmony_ci
176662306a36Sopenharmony_ci	/* This can't be called with NF_GET */
176762306a36Sopenharmony_ci	if (WARN_ON_ONCE(nf == NF_GET))
176862306a36Sopenharmony_ci		return NULL;
176962306a36Sopenharmony_ci
177062306a36Sopenharmony_ci	new_b = __alloc_buffer_wait(c, nf);
177162306a36Sopenharmony_ci	if (!new_b)
177262306a36Sopenharmony_ci		return NULL;
177362306a36Sopenharmony_ci
177462306a36Sopenharmony_ci	/*
177562306a36Sopenharmony_ci	 * We've had a period where the mutex was unlocked, so need to
177662306a36Sopenharmony_ci	 * recheck the buffer tree.
177762306a36Sopenharmony_ci	 */
177862306a36Sopenharmony_ci	b = cache_get(&c->cache, block);
177962306a36Sopenharmony_ci	if (b) {
178062306a36Sopenharmony_ci		__free_buffer_wake(new_b);
178162306a36Sopenharmony_ci		goto found_buffer;
178262306a36Sopenharmony_ci	}
178362306a36Sopenharmony_ci
178462306a36Sopenharmony_ci	__check_watermark(c, write_list);
178562306a36Sopenharmony_ci
178662306a36Sopenharmony_ci	b = new_b;
178762306a36Sopenharmony_ci	atomic_set(&b->hold_count, 1);
178862306a36Sopenharmony_ci	WRITE_ONCE(b->last_accessed, jiffies);
178962306a36Sopenharmony_ci	b->block = block;
179062306a36Sopenharmony_ci	b->read_error = 0;
179162306a36Sopenharmony_ci	b->write_error = 0;
179262306a36Sopenharmony_ci	b->list_mode = LIST_CLEAN;
179362306a36Sopenharmony_ci
179462306a36Sopenharmony_ci	if (nf == NF_FRESH)
179562306a36Sopenharmony_ci		b->state = 0;
179662306a36Sopenharmony_ci	else {
179762306a36Sopenharmony_ci		b->state = 1 << B_READING;
179862306a36Sopenharmony_ci		*need_submit = 1;
179962306a36Sopenharmony_ci	}
180062306a36Sopenharmony_ci
180162306a36Sopenharmony_ci	/*
180262306a36Sopenharmony_ci	 * We mustn't insert into the cache until the B_READING state
180362306a36Sopenharmony_ci	 * is set.  Otherwise another thread could get it and use
180462306a36Sopenharmony_ci	 * it before it had been read.
180562306a36Sopenharmony_ci	 */
180662306a36Sopenharmony_ci	cache_insert(&c->cache, b);
180762306a36Sopenharmony_ci
180862306a36Sopenharmony_ci	return b;
180962306a36Sopenharmony_ci
181062306a36Sopenharmony_cifound_buffer:
181162306a36Sopenharmony_ci	if (nf == NF_PREFETCH) {
181262306a36Sopenharmony_ci		cache_put_and_wake(c, b);
181362306a36Sopenharmony_ci		return NULL;
181462306a36Sopenharmony_ci	}
181562306a36Sopenharmony_ci
181662306a36Sopenharmony_ci	/*
181762306a36Sopenharmony_ci	 * Note: it is essential that we don't wait for the buffer to be
181862306a36Sopenharmony_ci	 * read if dm_bufio_get function is used. Both dm_bufio_get and
181962306a36Sopenharmony_ci	 * dm_bufio_prefetch can be used in the driver request routine.
182062306a36Sopenharmony_ci	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
182162306a36Sopenharmony_ci	 * the same buffer, it would deadlock if we waited.
182262306a36Sopenharmony_ci	 */
182362306a36Sopenharmony_ci	if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
182462306a36Sopenharmony_ci		cache_put_and_wake(c, b);
182562306a36Sopenharmony_ci		return NULL;
182662306a36Sopenharmony_ci	}
182762306a36Sopenharmony_ci
182862306a36Sopenharmony_ci	return b;
182962306a36Sopenharmony_ci}
183062306a36Sopenharmony_ci
183162306a36Sopenharmony_ci/*
183262306a36Sopenharmony_ci * The endio routine for reading: set the error, clear the bit and wake up
183362306a36Sopenharmony_ci * anyone waiting on the buffer.
183462306a36Sopenharmony_ci */
183562306a36Sopenharmony_cistatic void read_endio(struct dm_buffer *b, blk_status_t status)
183662306a36Sopenharmony_ci{
183762306a36Sopenharmony_ci	b->read_error = status;
183862306a36Sopenharmony_ci
183962306a36Sopenharmony_ci	BUG_ON(!test_bit(B_READING, &b->state));
184062306a36Sopenharmony_ci
184162306a36Sopenharmony_ci	smp_mb__before_atomic();
184262306a36Sopenharmony_ci	clear_bit(B_READING, &b->state);
184362306a36Sopenharmony_ci	smp_mb__after_atomic();
184462306a36Sopenharmony_ci
184562306a36Sopenharmony_ci	wake_up_bit(&b->state, B_READING);
184662306a36Sopenharmony_ci}
184762306a36Sopenharmony_ci
184862306a36Sopenharmony_ci/*
184962306a36Sopenharmony_ci * A common routine for dm_bufio_get, dm_bufio_read and dm_bufio_new.  The
185062306a36Sopenharmony_ci * operation of these functions is similar, except that dm_bufio_new doesn't
185162306a36Sopenharmony_ci * read the buffer from the disk (assuming that the caller overwrites all the
185262306a36Sopenharmony_ci * data and uses dm_bufio_mark_buffer_dirty to write new data back).
185362306a36Sopenharmony_ci */
185462306a36Sopenharmony_cistatic void *new_read(struct dm_bufio_client *c, sector_t block,
185562306a36Sopenharmony_ci		      enum new_flag nf, struct dm_buffer **bp)
185662306a36Sopenharmony_ci{
185762306a36Sopenharmony_ci	int need_submit = 0;
185862306a36Sopenharmony_ci	struct dm_buffer *b;
185962306a36Sopenharmony_ci
186062306a36Sopenharmony_ci	LIST_HEAD(write_list);
186162306a36Sopenharmony_ci
186262306a36Sopenharmony_ci	*bp = NULL;
186362306a36Sopenharmony_ci
186462306a36Sopenharmony_ci	/*
186562306a36Sopenharmony_ci	 * Fast path, hopefully the block is already in the cache.  No need
186662306a36Sopenharmony_ci	 * to get the client lock for this.
186762306a36Sopenharmony_ci	 */
186862306a36Sopenharmony_ci	b = cache_get(&c->cache, block);
186962306a36Sopenharmony_ci	if (b) {
187062306a36Sopenharmony_ci		if (nf == NF_PREFETCH) {
187162306a36Sopenharmony_ci			cache_put_and_wake(c, b);
187262306a36Sopenharmony_ci			return NULL;
187362306a36Sopenharmony_ci		}
187462306a36Sopenharmony_ci
187562306a36Sopenharmony_ci		/*
187662306a36Sopenharmony_ci		 * Note: it is essential that we don't wait for the buffer to be
187762306a36Sopenharmony_ci		 * read if dm_bufio_get function is used. Both dm_bufio_get and
187862306a36Sopenharmony_ci		 * dm_bufio_prefetch can be used in the driver request routine.
187962306a36Sopenharmony_ci		 * If the user called both dm_bufio_prefetch and dm_bufio_get on
188062306a36Sopenharmony_ci		 * the same buffer, it would deadlock if we waited.
188162306a36Sopenharmony_ci		 */
188262306a36Sopenharmony_ci		if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
188362306a36Sopenharmony_ci			cache_put_and_wake(c, b);
188462306a36Sopenharmony_ci			return NULL;
188562306a36Sopenharmony_ci		}
188662306a36Sopenharmony_ci	}
188762306a36Sopenharmony_ci
188862306a36Sopenharmony_ci	if (!b) {
188962306a36Sopenharmony_ci		if (nf == NF_GET)
189062306a36Sopenharmony_ci			return NULL;
189162306a36Sopenharmony_ci
189262306a36Sopenharmony_ci		dm_bufio_lock(c);
189362306a36Sopenharmony_ci		b = __bufio_new(c, block, nf, &need_submit, &write_list);
189462306a36Sopenharmony_ci		dm_bufio_unlock(c);
189562306a36Sopenharmony_ci	}
189662306a36Sopenharmony_ci
189762306a36Sopenharmony_ci#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
189862306a36Sopenharmony_ci	if (b && (atomic_read(&b->hold_count) == 1))
189962306a36Sopenharmony_ci		buffer_record_stack(b);
190062306a36Sopenharmony_ci#endif
190162306a36Sopenharmony_ci
190262306a36Sopenharmony_ci	__flush_write_list(&write_list);
190362306a36Sopenharmony_ci
190462306a36Sopenharmony_ci	if (!b)
190562306a36Sopenharmony_ci		return NULL;
190662306a36Sopenharmony_ci
190762306a36Sopenharmony_ci	if (need_submit)
190862306a36Sopenharmony_ci		submit_io(b, REQ_OP_READ, read_endio);
190962306a36Sopenharmony_ci
191062306a36Sopenharmony_ci	if (nf != NF_GET)	/* we already tested this condition above */
191162306a36Sopenharmony_ci		wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
191262306a36Sopenharmony_ci
191362306a36Sopenharmony_ci	if (b->read_error) {
191462306a36Sopenharmony_ci		int error = blk_status_to_errno(b->read_error);
191562306a36Sopenharmony_ci
191662306a36Sopenharmony_ci		dm_bufio_release(b);
191762306a36Sopenharmony_ci
191862306a36Sopenharmony_ci		return ERR_PTR(error);
191962306a36Sopenharmony_ci	}
192062306a36Sopenharmony_ci
192162306a36Sopenharmony_ci	*bp = b;
192262306a36Sopenharmony_ci
192362306a36Sopenharmony_ci	return b->data;
192462306a36Sopenharmony_ci}
192562306a36Sopenharmony_ci
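/*
 * Illustrative use of the read interface (a sketch, not code from this file;
 * "client" and "block" stand for the caller's bufio client and block number,
 * and error handling is reduced to the essentials):
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(client, block, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... use the block_size bytes at "data" ...
 *	dm_bufio_release(buf);
 */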
192662306a36Sopenharmony_civoid *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
192762306a36Sopenharmony_ci		   struct dm_buffer **bp)
192862306a36Sopenharmony_ci{
192962306a36Sopenharmony_ci	return new_read(c, block, NF_GET, bp);
193062306a36Sopenharmony_ci}
193162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_get);
193262306a36Sopenharmony_ci
193362306a36Sopenharmony_civoid *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
193462306a36Sopenharmony_ci		    struct dm_buffer **bp)
193562306a36Sopenharmony_ci{
193662306a36Sopenharmony_ci	if (WARN_ON_ONCE(dm_bufio_in_request()))
193762306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
193862306a36Sopenharmony_ci
193962306a36Sopenharmony_ci	return new_read(c, block, NF_READ, bp);
194062306a36Sopenharmony_ci}
194162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_read);
194262306a36Sopenharmony_ci
194362306a36Sopenharmony_civoid *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
194462306a36Sopenharmony_ci		   struct dm_buffer **bp)
194562306a36Sopenharmony_ci{
194662306a36Sopenharmony_ci	if (WARN_ON_ONCE(dm_bufio_in_request()))
194762306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
194862306a36Sopenharmony_ci
194962306a36Sopenharmony_ci	return new_read(c, block, NF_FRESH, bp);
195062306a36Sopenharmony_ci}
195162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_new);
195262306a36Sopenharmony_ci
195362306a36Sopenharmony_civoid dm_bufio_prefetch(struct dm_bufio_client *c,
195462306a36Sopenharmony_ci		       sector_t block, unsigned int n_blocks)
195562306a36Sopenharmony_ci{
195662306a36Sopenharmony_ci	struct blk_plug plug;
195762306a36Sopenharmony_ci
195862306a36Sopenharmony_ci	LIST_HEAD(write_list);
195962306a36Sopenharmony_ci
196062306a36Sopenharmony_ci	if (WARN_ON_ONCE(dm_bufio_in_request()))
196162306a36Sopenharmony_ci		return; /* should never happen */
196262306a36Sopenharmony_ci
196362306a36Sopenharmony_ci	blk_start_plug(&plug);
196462306a36Sopenharmony_ci
196562306a36Sopenharmony_ci	for (; n_blocks--; block++) {
196662306a36Sopenharmony_ci		int need_submit;
196762306a36Sopenharmony_ci		struct dm_buffer *b;
196862306a36Sopenharmony_ci
196962306a36Sopenharmony_ci		b = cache_get(&c->cache, block);
197062306a36Sopenharmony_ci		if (b) {
197162306a36Sopenharmony_ci			/* already in cache */
197262306a36Sopenharmony_ci			cache_put_and_wake(c, b);
197362306a36Sopenharmony_ci			continue;
197462306a36Sopenharmony_ci		}
197562306a36Sopenharmony_ci
197662306a36Sopenharmony_ci		dm_bufio_lock(c);
197762306a36Sopenharmony_ci		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
197862306a36Sopenharmony_ci				&write_list);
197962306a36Sopenharmony_ci		if (unlikely(!list_empty(&write_list))) {
198062306a36Sopenharmony_ci			dm_bufio_unlock(c);
198162306a36Sopenharmony_ci			blk_finish_plug(&plug);
198262306a36Sopenharmony_ci			__flush_write_list(&write_list);
198362306a36Sopenharmony_ci			blk_start_plug(&plug);
198462306a36Sopenharmony_ci			dm_bufio_lock(c);
198562306a36Sopenharmony_ci		}
198662306a36Sopenharmony_ci		if (unlikely(b != NULL)) {
198762306a36Sopenharmony_ci			dm_bufio_unlock(c);
198862306a36Sopenharmony_ci
198962306a36Sopenharmony_ci			if (need_submit)
199062306a36Sopenharmony_ci				submit_io(b, REQ_OP_READ, read_endio);
199162306a36Sopenharmony_ci			dm_bufio_release(b);
199262306a36Sopenharmony_ci
199362306a36Sopenharmony_ci			cond_resched();
199462306a36Sopenharmony_ci
199562306a36Sopenharmony_ci			if (!n_blocks)
199662306a36Sopenharmony_ci				goto flush_plug;
199762306a36Sopenharmony_ci			dm_bufio_lock(c);
199862306a36Sopenharmony_ci		}
199962306a36Sopenharmony_ci		dm_bufio_unlock(c);
200062306a36Sopenharmony_ci	}
200162306a36Sopenharmony_ci
200262306a36Sopenharmony_ciflush_plug:
200362306a36Sopenharmony_ci	blk_finish_plug(&plug);
200462306a36Sopenharmony_ci}
200562306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_prefetch);
200662306a36Sopenharmony_ci
200762306a36Sopenharmony_civoid dm_bufio_release(struct dm_buffer *b)
200862306a36Sopenharmony_ci{
200962306a36Sopenharmony_ci	struct dm_bufio_client *c = b->c;
201062306a36Sopenharmony_ci
201162306a36Sopenharmony_ci	/*
201262306a36Sopenharmony_ci	 * If there were errors on the buffer, and the buffer is not
201362306a36Sopenharmony_ci	 * to be written, free the buffer. There is no point in caching an
201462306a36Sopenharmony_ci	 * invalid buffer.
201562306a36Sopenharmony_ci	 */
201662306a36Sopenharmony_ci	if ((b->read_error || b->write_error) &&
201762306a36Sopenharmony_ci	    !test_bit_acquire(B_READING, &b->state) &&
201862306a36Sopenharmony_ci	    !test_bit(B_WRITING, &b->state) &&
201962306a36Sopenharmony_ci	    !test_bit(B_DIRTY, &b->state)) {
202062306a36Sopenharmony_ci		dm_bufio_lock(c);
202162306a36Sopenharmony_ci
202262306a36Sopenharmony_ci		/* cache remove can fail if there are other holders */
202362306a36Sopenharmony_ci		if (cache_remove(&c->cache, b)) {
202462306a36Sopenharmony_ci			__free_buffer_wake(b);
202562306a36Sopenharmony_ci			dm_bufio_unlock(c);
202662306a36Sopenharmony_ci			return;
202762306a36Sopenharmony_ci		}
202862306a36Sopenharmony_ci
202962306a36Sopenharmony_ci		dm_bufio_unlock(c);
203062306a36Sopenharmony_ci	}
203162306a36Sopenharmony_ci
203262306a36Sopenharmony_ci	cache_put_and_wake(c, b);
203362306a36Sopenharmony_ci}
203462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_release);
203562306a36Sopenharmony_ci
203662306a36Sopenharmony_civoid dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
203762306a36Sopenharmony_ci					unsigned int start, unsigned int end)
203862306a36Sopenharmony_ci{
203962306a36Sopenharmony_ci	struct dm_bufio_client *c = b->c;
204062306a36Sopenharmony_ci
204162306a36Sopenharmony_ci	BUG_ON(start >= end);
204262306a36Sopenharmony_ci	BUG_ON(end > b->c->block_size);
204362306a36Sopenharmony_ci
204462306a36Sopenharmony_ci	dm_bufio_lock(c);
204562306a36Sopenharmony_ci
204662306a36Sopenharmony_ci	BUG_ON(test_bit(B_READING, &b->state));
204762306a36Sopenharmony_ci
204862306a36Sopenharmony_ci	if (!test_and_set_bit(B_DIRTY, &b->state)) {
204962306a36Sopenharmony_ci		b->dirty_start = start;
205062306a36Sopenharmony_ci		b->dirty_end = end;
205162306a36Sopenharmony_ci		cache_mark(&c->cache, b, LIST_DIRTY);
205262306a36Sopenharmony_ci	} else {
205362306a36Sopenharmony_ci		if (start < b->dirty_start)
205462306a36Sopenharmony_ci			b->dirty_start = start;
205562306a36Sopenharmony_ci		if (end > b->dirty_end)
205662306a36Sopenharmony_ci			b->dirty_end = end;
205762306a36Sopenharmony_ci	}
205862306a36Sopenharmony_ci
205962306a36Sopenharmony_ci	dm_bufio_unlock(c);
206062306a36Sopenharmony_ci}
206162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
206262306a36Sopenharmony_ci
206362306a36Sopenharmony_civoid dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
206462306a36Sopenharmony_ci{
206562306a36Sopenharmony_ci	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
206662306a36Sopenharmony_ci}
206762306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
206862306a36Sopenharmony_ci
206962306a36Sopenharmony_civoid dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
207062306a36Sopenharmony_ci{
207162306a36Sopenharmony_ci	LIST_HEAD(write_list);
207262306a36Sopenharmony_ci
207362306a36Sopenharmony_ci	if (WARN_ON_ONCE(dm_bufio_in_request()))
207462306a36Sopenharmony_ci		return; /* should never happen */
207562306a36Sopenharmony_ci
207662306a36Sopenharmony_ci	dm_bufio_lock(c);
207762306a36Sopenharmony_ci	__write_dirty_buffers_async(c, 0, &write_list);
207862306a36Sopenharmony_ci	dm_bufio_unlock(c);
207962306a36Sopenharmony_ci	__flush_write_list(&write_list);
208062306a36Sopenharmony_ci}
208162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
208262306a36Sopenharmony_ci
208362306a36Sopenharmony_ci/*
208462306a36Sopenharmony_ci * For performance, it is essential that the buffers are written asynchronously
208562306a36Sopenharmony_ci * and simultaneously (so that the block layer can merge the writes) and then
208662306a36Sopenharmony_ci * waited upon.
208762306a36Sopenharmony_ci *
208862306a36Sopenharmony_ci * Finally, we flush the hardware disk cache.
208962306a36Sopenharmony_ci */
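/* lru_iter predicate: pick buffers that still have a write in flight. */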
209062306a36Sopenharmony_cistatic bool is_writing(struct lru_entry *e, void *context)
209162306a36Sopenharmony_ci{
209262306a36Sopenharmony_ci	struct dm_buffer *b = le_to_buffer(e);
209362306a36Sopenharmony_ci
209462306a36Sopenharmony_ci	return test_bit(B_WRITING, &b->state);
209562306a36Sopenharmony_ci}
209662306a36Sopenharmony_ci
209762306a36Sopenharmony_ciint dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
209862306a36Sopenharmony_ci{
209962306a36Sopenharmony_ci	int a, f;
210062306a36Sopenharmony_ci	unsigned long nr_buffers;
210162306a36Sopenharmony_ci	struct lru_entry *e;
210262306a36Sopenharmony_ci	struct lru_iter it;
210362306a36Sopenharmony_ci
210462306a36Sopenharmony_ci	LIST_HEAD(write_list);
210562306a36Sopenharmony_ci
210662306a36Sopenharmony_ci	dm_bufio_lock(c);
210762306a36Sopenharmony_ci	__write_dirty_buffers_async(c, 0, &write_list);
210862306a36Sopenharmony_ci	dm_bufio_unlock(c);
210962306a36Sopenharmony_ci	__flush_write_list(&write_list);
211062306a36Sopenharmony_ci	dm_bufio_lock(c);
211162306a36Sopenharmony_ci
211262306a36Sopenharmony_ci	nr_buffers = cache_count(&c->cache, LIST_DIRTY);
211362306a36Sopenharmony_ci	lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it);
211462306a36Sopenharmony_ci	while ((e = lru_iter_next(&it, is_writing, c))) {
211562306a36Sopenharmony_ci		struct dm_buffer *b = le_to_buffer(e);
211662306a36Sopenharmony_ci		__cache_inc_buffer(b);
211762306a36Sopenharmony_ci
211862306a36Sopenharmony_ci		BUG_ON(test_bit(B_READING, &b->state));
211962306a36Sopenharmony_ci
212062306a36Sopenharmony_ci		if (nr_buffers) {
212162306a36Sopenharmony_ci			nr_buffers--;
212262306a36Sopenharmony_ci			dm_bufio_unlock(c);
212362306a36Sopenharmony_ci			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
212462306a36Sopenharmony_ci			dm_bufio_lock(c);
212562306a36Sopenharmony_ci		} else {
212662306a36Sopenharmony_ci			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
212762306a36Sopenharmony_ci		}
212862306a36Sopenharmony_ci
212962306a36Sopenharmony_ci		if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
213062306a36Sopenharmony_ci			cache_mark(&c->cache, b, LIST_CLEAN);
213162306a36Sopenharmony_ci
213262306a36Sopenharmony_ci		cache_put_and_wake(c, b);
213362306a36Sopenharmony_ci
213462306a36Sopenharmony_ci		cond_resched();
213562306a36Sopenharmony_ci	}
213662306a36Sopenharmony_ci	lru_iter_end(&it);
213762306a36Sopenharmony_ci
213862306a36Sopenharmony_ci	wake_up(&c->free_buffer_wait);
213962306a36Sopenharmony_ci	dm_bufio_unlock(c);
214062306a36Sopenharmony_ci
214162306a36Sopenharmony_ci	a = xchg(&c->async_write_error, 0);
214262306a36Sopenharmony_ci	f = dm_bufio_issue_flush(c);
214362306a36Sopenharmony_ci	if (a)
214462306a36Sopenharmony_ci		return a;
214562306a36Sopenharmony_ci
214662306a36Sopenharmony_ci	return f;
214762306a36Sopenharmony_ci}
214862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
214962306a36Sopenharmony_ci
215062306a36Sopenharmony_ci/*
215162306a36Sopenharmony_ci * Use dm-io to send an empty flush request (REQ_PREFLUSH) to flush the device.
215262306a36Sopenharmony_ci */
215362306a36Sopenharmony_ciint dm_bufio_issue_flush(struct dm_bufio_client *c)
215462306a36Sopenharmony_ci{
215562306a36Sopenharmony_ci	struct dm_io_request io_req = {
215662306a36Sopenharmony_ci		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
215762306a36Sopenharmony_ci		.mem.type = DM_IO_KMEM,
215862306a36Sopenharmony_ci		.mem.ptr.addr = NULL,
215962306a36Sopenharmony_ci		.client = c->dm_io,
216062306a36Sopenharmony_ci	};
216162306a36Sopenharmony_ci	struct dm_io_region io_reg = {
216262306a36Sopenharmony_ci		.bdev = c->bdev,
216362306a36Sopenharmony_ci		.sector = 0,
216462306a36Sopenharmony_ci		.count = 0,
216562306a36Sopenharmony_ci	};
216662306a36Sopenharmony_ci
216762306a36Sopenharmony_ci	if (WARN_ON_ONCE(dm_bufio_in_request()))
216862306a36Sopenharmony_ci		return -EINVAL;
216962306a36Sopenharmony_ci
217062306a36Sopenharmony_ci	return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
217162306a36Sopenharmony_ci}
217262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
217362306a36Sopenharmony_ci
217462306a36Sopenharmony_ci/*
217562306a36Sopenharmony_ci * Use dm-io to send a discard request to the device.
217662306a36Sopenharmony_ci */
217762306a36Sopenharmony_ciint dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
217862306a36Sopenharmony_ci{
217962306a36Sopenharmony_ci	struct dm_io_request io_req = {
218062306a36Sopenharmony_ci		.bi_opf = REQ_OP_DISCARD | REQ_SYNC,
218162306a36Sopenharmony_ci		.mem.type = DM_IO_KMEM,
218262306a36Sopenharmony_ci		.mem.ptr.addr = NULL,
218362306a36Sopenharmony_ci		.client = c->dm_io,
218462306a36Sopenharmony_ci	};
218562306a36Sopenharmony_ci	struct dm_io_region io_reg = {
218662306a36Sopenharmony_ci		.bdev = c->bdev,
218762306a36Sopenharmony_ci		.sector = block_to_sector(c, block),
218862306a36Sopenharmony_ci		.count = block_to_sector(c, count),
218962306a36Sopenharmony_ci	};
219062306a36Sopenharmony_ci
219162306a36Sopenharmony_ci	if (WARN_ON_ONCE(dm_bufio_in_request()))
219262306a36Sopenharmony_ci		return -EINVAL; /* discards are optional */
219362306a36Sopenharmony_ci
219462306a36Sopenharmony_ci	return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
219562306a36Sopenharmony_ci}
219662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
219762306a36Sopenharmony_ci
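/*
 * Try to drop a buffer from the cache if it is idle (no I/O in flight and
 * not dirty).  Returns true if the block was found in the cache at all.
 */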
219862306a36Sopenharmony_cistatic bool forget_buffer(struct dm_bufio_client *c, sector_t block)
219962306a36Sopenharmony_ci{
220062306a36Sopenharmony_ci	struct dm_buffer *b;
220162306a36Sopenharmony_ci
220262306a36Sopenharmony_ci	b = cache_get(&c->cache, block);
220362306a36Sopenharmony_ci	if (b) {
220462306a36Sopenharmony_ci		if (likely(!smp_load_acquire(&b->state))) {
220562306a36Sopenharmony_ci			if (cache_remove(&c->cache, b))
220662306a36Sopenharmony_ci				__free_buffer_wake(b);
220762306a36Sopenharmony_ci			else
220862306a36Sopenharmony_ci				cache_put_and_wake(c, b);
220962306a36Sopenharmony_ci		} else {
221062306a36Sopenharmony_ci			cache_put_and_wake(c, b);
221162306a36Sopenharmony_ci		}
221262306a36Sopenharmony_ci	}
221362306a36Sopenharmony_ci
221462306a36Sopenharmony_ci	return b ? true : false;
221562306a36Sopenharmony_ci}
221662306a36Sopenharmony_ci
221762306a36Sopenharmony_ci/*
221862306a36Sopenharmony_ci * Free the given buffer.
221962306a36Sopenharmony_ci *
222062306a36Sopenharmony_ci * This is just a hint; if the buffer is in use or dirty, this function
222162306a36Sopenharmony_ci * does nothing.
222262306a36Sopenharmony_ci */
222362306a36Sopenharmony_civoid dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
222462306a36Sopenharmony_ci{
222562306a36Sopenharmony_ci	dm_bufio_lock(c);
222662306a36Sopenharmony_ci	forget_buffer(c, block);
222762306a36Sopenharmony_ci	dm_bufio_unlock(c);
222862306a36Sopenharmony_ci}
222962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_forget);
223062306a36Sopenharmony_ci
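/* Only buffers with no I/O in flight and no dirty data may be dropped. */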
223162306a36Sopenharmony_cistatic enum evict_result idle(struct dm_buffer *b, void *context)
223262306a36Sopenharmony_ci{
223362306a36Sopenharmony_ci	return b->state ? ER_DONT_EVICT : ER_EVICT;
223462306a36Sopenharmony_ci}
223562306a36Sopenharmony_ci
223662306a36Sopenharmony_civoid dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
223762306a36Sopenharmony_ci{
223862306a36Sopenharmony_ci	dm_bufio_lock(c);
223962306a36Sopenharmony_ci	cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake);
224062306a36Sopenharmony_ci	dm_bufio_unlock(c);
224162306a36Sopenharmony_ci}
224262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
224362306a36Sopenharmony_ci
224462306a36Sopenharmony_civoid dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
224562306a36Sopenharmony_ci{
224662306a36Sopenharmony_ci	c->minimum_buffers = n;
224762306a36Sopenharmony_ci}
224862306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
224962306a36Sopenharmony_ci
225062306a36Sopenharmony_ciunsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
225162306a36Sopenharmony_ci{
225262306a36Sopenharmony_ci	return c->block_size;
225362306a36Sopenharmony_ci}
225462306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
225562306a36Sopenharmony_ci
225662306a36Sopenharmony_cisector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
225762306a36Sopenharmony_ci{
225862306a36Sopenharmony_ci	sector_t s = bdev_nr_sectors(c->bdev);
225962306a36Sopenharmony_ci
226062306a36Sopenharmony_ci	if (s >= c->start)
226162306a36Sopenharmony_ci		s -= c->start;
226262306a36Sopenharmony_ci	else
226362306a36Sopenharmony_ci		s = 0;
226462306a36Sopenharmony_ci	if (likely(c->sectors_per_block_bits >= 0))
226562306a36Sopenharmony_ci		s >>= c->sectors_per_block_bits;
226662306a36Sopenharmony_ci	else
226762306a36Sopenharmony_ci		sector_div(s, c->block_size >> SECTOR_SHIFT);
226862306a36Sopenharmony_ci	return s;
226962306a36Sopenharmony_ci}
227062306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
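
/*
 * Worked example of the arithmetic above (values are only an illustration):
 * a 1 GiB device (2097152 sectors), c->start == 0 and a 4096-byte block
 * size give sectors_per_block_bits == 3, so the reported size is
 * 2097152 >> 3 == 262144 blocks.
 */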

struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
{
	return c->dm_io;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
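
/*
 * The auxiliary data lives directly behind struct dm_buffer, so a client
 * that passed aux_size to dm_bufio_client_create() gets per-buffer state
 * without extra allocations.  Illustrative sketch (struct my_aux is
 * hypothetical):
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, sizeof(struct my_aux),
 *				   NULL, NULL, 0);
 *	...
 *	struct my_aux *aux = dm_bufio_get_aux_data(b);
 */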

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static enum it_action warn_leak(struct dm_buffer *b, void *context)
{
	bool *warned = context;

	WARN_ON(!(*warned));
	*warned = true;
	DMERR("leaked buffer %llx, hold count %u, list %d",
	      (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	stack_trace_print(b->stack_entries, b->stack_len, 1);
	/* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
	atomic_set(&b->hold_count, 0);
#endif
	return IT_NEXT;
}

static void drop_buffers(struct dm_bufio_client *c)
{
	int i;
	struct dm_buffer *b;

	if (WARN_ON(dm_bufio_in_request()))
		return; /* should never happen */

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++) {
		bool warned = false;

		cache_iterate(&c->cache, i, warn_leak, &warned);
	}

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);
#endif

	for (i = 0; i < LIST_SIZE; i++)
		WARN_ON(cache_count(&c->cache, i));

	dm_bufio_unlock(c);
}

static unsigned long get_retain_buffers(struct dm_bufio_client *c)
{
	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);

	if (likely(c->sectors_per_block_bits >= 0))
		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
	else
		retain_bytes /= c->block_size;

	return retain_bytes;
}
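
/*
 * Worked example (illustration only): with the default
 * DM_BUFIO_DEFAULT_RETAIN_BYTES of 256 KiB and a 4096-byte block size,
 * sectors_per_block_bits is 3, so 262144 >> (3 + SECTOR_SHIFT) == 64
 * buffers are retained per client.
 */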

static void __scan(struct dm_bufio_client *c)
{
	int l;
	struct dm_buffer *b;
	unsigned long freed = 0;
	unsigned long retain_target = get_retain_buffers(c);
	unsigned long count = cache_total(&c->cache);

	for (l = 0; l < LIST_SIZE; l++) {
		while (true) {
			if (count - freed <= retain_target)
				atomic_long_set(&c->need_shrink, 0);
			if (!atomic_long_read(&c->need_shrink))
				break;

			b = cache_evict(&c->cache, l,
					l == LIST_CLEAN ? is_clean : is_dirty, c);
			if (!b)
				break;

			__make_buffer_clean(b);
			__free_buffer_wake(b);

			atomic_long_dec(&c->need_shrink);
			freed++;
			cond_resched();
		}
	}
}

static void shrink_work(struct work_struct *w)
{
	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);

	dm_bufio_lock(c);
	__scan(c);
	dm_bufio_unlock(c);
}

static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
	queue_work(dm_bufio_wq, &c->shrink_work);

	return sc->nr_to_scan;
}

static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
	unsigned long count = cache_total(&c->cache);
	unsigned long retain_target = get_retain_buffers(c);
	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);

	if (unlikely(count < retain_target))
		count = 0;
	else
		count -= retain_target;

	if (unlikely(count < queued_for_cleanup))
		count = 0;
	else
		count -= queued_for_cleanup;

	return count;
}
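
/*
 * Illustration of the accounting above (numbers are made up): with 1000
 * cached buffers, a retain target of 64 and 100 buffers already queued for
 * cleanup, the shrinker reports 1000 - 64 - 100 == 836 reclaimable objects,
 * clamping at zero instead of underflowing.
 */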

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
					       unsigned int reserved_buffers, unsigned int aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *),
					       unsigned int flags)
{
	int r;
	unsigned int num_locks;
	struct dm_bufio_client *c;
	char slab_name[27];

	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
		r = -EINVAL;
		goto bad_client;
	}

	num_locks = dm_num_hash_locks();
	c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0);

	c->bdev = bdev;
	c->block_size = block_size;
	if (is_power_of_2(block_size))
		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
	else
		c->sectors_per_block_bits = -1;

	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
		c->no_sleep = true;
		static_branch_inc(&no_sleep_enabled);
	}

	mutex_init(&c->lock);
	spin_lock_init(&c->spinlock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	if (block_size <= KMALLOC_MAX_SIZE &&
	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
		unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);

		snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
						  SLAB_RECLAIM_ACCOUNT, NULL);
		if (!c->slab_cache) {
			r = -ENOMEM;
			goto bad;
		}
	}
	if (aux_size)
		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
	else
		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
					   0, SLAB_RECLAIM_ACCOUNT, NULL);
	if (!c->slab_buffer) {
		r = -ENOMEM;
		goto bad;
	}

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad;
		}
		__free_buffer_wake(b);
	}

	INIT_WORK(&c->shrink_work, shrink_work);
	atomic_long_set(&c->need_shrink, 0);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)",
			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
	if (r)
		goto bad;

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	return c;

bad:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

		list_del(&b->lru.list);
		free_buffer(b);
	}
	kmem_cache_destroy(c->slab_cache);
	kmem_cache_destroy(c->slab_buffer);
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	mutex_destroy(&c->lock);
	if (c->no_sleep)
		static_branch_dec(&no_sleep_enabled);
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
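
/*
 * Typical client lifecycle (illustrative sketch only; error handling is
 * trimmed and the variable names are hypothetical):
 *
 *	struct dm_bufio_client *c;
 *	struct dm_buffer *b;
 *	void *data;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 *	data = dm_bufio_read(c, block, &b);
 *	if (!IS_ERR(data)) {
 *		memcpy(data, new_contents, 4096);
 *		dm_bufio_mark_buffer_dirty(b);
 *		dm_bufio_release(b);
 *	}
 *
 *	dm_bufio_write_dirty_buffers(c);
 *	dm_bufio_client_destroy(c);
 */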

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned int i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);
	flush_work(&c->shrink_work);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	WARN_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);

		list_del(&b->lru.list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (cache_count(&c->cache, i))
			DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));

	for (i = 0; i < LIST_SIZE; i++)
		WARN_ON(cache_count(&c->cache, i));

	cache_destroy(&c->cache);
	kmem_cache_destroy(c->slab_cache);
	kmem_cache_destroy(c->slab_buffer);
	dm_io_client_destroy(c->dm_io);
	mutex_destroy(&c->lock);
	if (c->no_sleep)
		static_branch_dec(&no_sleep_enabled);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

void dm_bufio_client_reset(struct dm_bufio_client *c)
{
	drop_buffers(c);
	flush_work(&c->shrink_work);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_reset);

void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
{
	c->start = start;
}
EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
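
/*
 * Example (illustration only): if a target keeps its metadata starting at
 * sector 8192 of the underlying device, calling
 * dm_bufio_set_sector_offset(c, 8192) makes buffer block 0 correspond to
 * that sector, and dm_bufio_get_device_size() reports only the remainder
 * of the device.
 */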

/*--------------------------------------------------------------*/

static unsigned int get_max_age_hz(void)
{
	unsigned int max_age = READ_ONCE(dm_bufio_max_age);

	if (max_age > UINT_MAX / HZ)
		max_age = UINT_MAX / HZ;

	return max_age * HZ;
}

static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
	return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
}

struct evict_params {
	gfp_t gfp;
	unsigned long age_hz;

	/*
	 * This gets updated with the largest last_accessed (ie. most
	 * recently used) of the evicted buffers.  It will not be reinitialised
	 * by __evict_many(), so you can use it across multiple invocations.
	 */
	unsigned long last_accessed;
};

/*
 * We may not be able to evict this buffer if I/O is pending or the client
 * is still using it.
 *
 * And if GFP_NOFS is used, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
{
	struct evict_params *params = context;

	if (!(params->gfp & __GFP_FS) ||
	    (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
		if (test_bit_acquire(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return ER_DONT_EVICT;
	}

	return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
}

static unsigned long __evict_many(struct dm_bufio_client *c,
				  struct evict_params *params,
				  int list_mode, unsigned long max_count)
{
	unsigned long count;
	unsigned long last_accessed;
	struct dm_buffer *b;

	for (count = 0; count < max_count; count++) {
		b = cache_evict(&c->cache, list_mode, select_for_evict, params);
		if (!b)
			break;

		last_accessed = READ_ONCE(b->last_accessed);
		if (time_after_eq(params->last_accessed, last_accessed))
			params->last_accessed = last_accessed;

		__make_buffer_clean(b);
		__free_buffer_wake(b);

		cond_resched();
	}

	return count;
}

static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
	struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
	unsigned long retain = get_retain_buffers(c);
	unsigned long count;
	LIST_HEAD(write_list);

	dm_bufio_lock(c);

	__check_watermark(c, &write_list);
	if (unlikely(!list_empty(&write_list))) {
		dm_bufio_unlock(c);
		__flush_write_list(&write_list);
		dm_bufio_lock(c);
	}

	count = cache_total(&c->cache);
	if (count > retain)
		__evict_many(c, &params, LIST_CLEAN, count - retain);

	dm_bufio_unlock(c);
}

static void cleanup_old_buffers(void)
{
	unsigned long max_age_hz = get_max_age_hz();
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);

	__cache_size_refresh();

	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
		evict_old_buffers(c, max_age_hz);

	mutex_unlock(&dm_bufio_clients_lock);
}

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*--------------------------------------------------------------*/

/*
 * Global cleanup tries to evict the oldest buffers from across _all_
 * the clients.  It does this by repeatedly evicting a few buffers from
 * the client that holds the oldest buffer.  It's approximate, but hopefully
 * good enough.
 */
static struct dm_bufio_client *__pop_client(void)
{
	struct list_head *h;

	if (list_empty(&dm_bufio_all_clients))
		return NULL;

	h = dm_bufio_all_clients.next;
	list_del(h);
	return container_of(h, struct dm_bufio_client, client_list);
}

/*
 * Inserts the client in the global client list based on its
 * 'oldest_buffer' field.
 */
static void __insert_client(struct dm_bufio_client *new_client)
{
	struct dm_bufio_client *c;
	struct list_head *h = dm_bufio_all_clients.next;

	while (h != &dm_bufio_all_clients) {
		c = container_of(h, struct dm_bufio_client, client_list);
		if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
			break;
		h = h->next;
	}

	list_add_tail(&new_client->client_list, h);
}
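
/*
 * Illustration of the ordering (made-up jiffies values): with clients whose
 * oldest_buffer values are 100, 250 and 400, a client re-inserted with
 * oldest_buffer == 300 lands between 250 and 400, so __pop_client() keeps
 * handing back the client that holds the globally oldest buffer first.
 */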

static unsigned long __evict_a_few(unsigned long nr_buffers)
{
	unsigned long count;
	struct dm_bufio_client *c;
	struct evict_params params = {
		.gfp = GFP_KERNEL,
		.age_hz = 0,
		/* set to jiffies in case there are no buffers in this client */
		.last_accessed = jiffies
	};

	c = __pop_client();
	if (!c)
		return 0;

	dm_bufio_lock(c);
	count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
	dm_bufio_unlock(c);

	if (count)
		c->oldest_buffer = params.last_accessed;
	__insert_client(c);

	return count;
}

static void check_watermarks(void)
{
	LIST_HEAD(write_list);
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		dm_bufio_lock(c);
		__check_watermark(c, &write_list);
		dm_bufio_unlock(c);
	}
	mutex_unlock(&dm_bufio_clients_lock);

	__flush_write_list(&write_list);
}

static void evict_old(void)
{
	unsigned long threshold = dm_bufio_cache_size -
		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;

	mutex_lock(&dm_bufio_clients_lock);
	while (dm_bufio_current_allocated > threshold) {
		if (!__evict_a_few(64))
			break;
		cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}
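
/*
 * Worked example (numbers are only an illustration): with a 64 MiB cache
 * size and DM_BUFIO_LOW_WATERMARK_RATIO of 16, the threshold is
 * 64 MiB - 4 MiB == 60 MiB, so buffers are evicted in batches of 64 until
 * the allocation drops below that, or until nothing more can be evicted.
 */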

static void do_global_cleanup(struct work_struct *w)
{
	check_watermarks();
	evict_old();
}

/*
 *--------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------
 */

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}
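
/*
 * Worked example of the default limit (illustration only): on a machine
 * with 8 GiB of low memory, DM_BUFIO_MEMORY_PERCENT of 2 gives roughly
 * 164 MiB; with CONFIG_MMU the result is further capped at
 * DM_BUFIO_VMALLOC_PERCENT (25%) of the vmalloc area, which mostly matters
 * on 32-bit systems where that area is small.
 */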

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;

	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
	destroy_workqueue(dm_bufio_wq);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
			__func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
			__func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	WARN_ON(bug); /* leaks are not worth crashing the system */
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
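
/*
 * Note: these parameters are exposed under /sys/module/dm_bufio/parameters/
 * with the permissions given above, so for example max_cache_size_bytes and
 * retain_bytes can be tuned at runtime, while the allocation counters are
 * read-only.
 */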

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");