18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
28c2ecf20Sopenharmony_ci/* -*- mode: c; c-basic-offset: 8; -*-
38c2ecf20Sopenharmony_ci * vim: noexpandtab sw=8 ts=8 sts=0:
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * uptodate.c
68c2ecf20Sopenharmony_ci *
78c2ecf20Sopenharmony_ci * Tracking the up-to-date-ness of a local buffer_head with respect to
88c2ecf20Sopenharmony_ci * the cluster.
98c2ecf20Sopenharmony_ci *
108c2ecf20Sopenharmony_ci * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
118c2ecf20Sopenharmony_ci *
128c2ecf20Sopenharmony_ci * Standard buffer head caching flags (uptodate, etc) are insufficient
138c2ecf20Sopenharmony_ci * in a clustered environment - a buffer may be marked up to date on
148c2ecf20Sopenharmony_ci * our local node but could have been modified by another cluster
158c2ecf20Sopenharmony_ci * member. As a result an additional (and performant) caching scheme
168c2ecf20Sopenharmony_ci * is required. A further requirement is that we consume as little
178c2ecf20Sopenharmony_ci * memory as possible - we never pin buffer_head structures in order
188c2ecf20Sopenharmony_ci * to cache them.
198c2ecf20Sopenharmony_ci *
208c2ecf20Sopenharmony_ci * We track the existence of up to date buffers on the inodes which
218c2ecf20Sopenharmony_ci * are associated with them. Because we don't want to pin
228c2ecf20Sopenharmony_ci * buffer_heads, this is only a (strong) hint and several other checks
238c2ecf20Sopenharmony_ci * are made in the I/O path to ensure that we don't use a stale or
248c2ecf20Sopenharmony_ci * invalid buffer without going to disk:
258c2ecf20Sopenharmony_ci *	- buffer_jbd is used liberally - if a bh is in the journal on
268c2ecf20Sopenharmony_ci *	  this node then it *must* be up to date.
278c2ecf20Sopenharmony_ci *	- the standard buffer_uptodate() macro is used to detect buffers
288c2ecf20Sopenharmony_ci *	  which may be invalid (even if we have an up to date tracking
298c2ecf20Sopenharmony_ci * 	  item for them)
308c2ecf20Sopenharmony_ci *
318c2ecf20Sopenharmony_ci * For a full understanding of how this code works together, one
328c2ecf20Sopenharmony_ci * should read the callers in dlmglue.c, the I/O functions in
338c2ecf20Sopenharmony_ci * buffer_head_io.c and ocfs2_journal_access in journal.c
348c2ecf20Sopenharmony_ci */
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_ci#include <linux/fs.h>
378c2ecf20Sopenharmony_ci#include <linux/types.h>
388c2ecf20Sopenharmony_ci#include <linux/slab.h>
398c2ecf20Sopenharmony_ci#include <linux/highmem.h>
408c2ecf20Sopenharmony_ci#include <linux/buffer_head.h>
418c2ecf20Sopenharmony_ci#include <linux/rbtree.h>
428c2ecf20Sopenharmony_ci
438c2ecf20Sopenharmony_ci#include <cluster/masklog.h>
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_ci#include "ocfs2.h"
468c2ecf20Sopenharmony_ci
478c2ecf20Sopenharmony_ci#include "inode.h"
488c2ecf20Sopenharmony_ci#include "uptodate.h"
498c2ecf20Sopenharmony_ci#include "ocfs2_trace.h"
508c2ecf20Sopenharmony_ci
518c2ecf20Sopenharmony_cistruct ocfs2_meta_cache_item {
528c2ecf20Sopenharmony_ci	struct rb_node	c_node;
538c2ecf20Sopenharmony_ci	sector_t	c_block;
548c2ecf20Sopenharmony_ci};
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_cistatic struct kmem_cache *ocfs2_uptodate_cachep;
578c2ecf20Sopenharmony_ci
588c2ecf20Sopenharmony_ciu64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
598c2ecf20Sopenharmony_ci{
608c2ecf20Sopenharmony_ci	BUG_ON(!ci || !ci->ci_ops);
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci	return ci->ci_ops->co_owner(ci);
638c2ecf20Sopenharmony_ci}
648c2ecf20Sopenharmony_ci
658c2ecf20Sopenharmony_cistruct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
668c2ecf20Sopenharmony_ci{
678c2ecf20Sopenharmony_ci	BUG_ON(!ci || !ci->ci_ops);
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_ci	return ci->ci_ops->co_get_super(ci);
708c2ecf20Sopenharmony_ci}
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_cistatic void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
738c2ecf20Sopenharmony_ci{
748c2ecf20Sopenharmony_ci	BUG_ON(!ci || !ci->ci_ops);
758c2ecf20Sopenharmony_ci
768c2ecf20Sopenharmony_ci	ci->ci_ops->co_cache_lock(ci);
778c2ecf20Sopenharmony_ci}
788c2ecf20Sopenharmony_ci
798c2ecf20Sopenharmony_cistatic void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
808c2ecf20Sopenharmony_ci{
818c2ecf20Sopenharmony_ci	BUG_ON(!ci || !ci->ci_ops);
828c2ecf20Sopenharmony_ci
838c2ecf20Sopenharmony_ci	ci->ci_ops->co_cache_unlock(ci);
848c2ecf20Sopenharmony_ci}
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_civoid ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
878c2ecf20Sopenharmony_ci{
888c2ecf20Sopenharmony_ci	BUG_ON(!ci || !ci->ci_ops);
898c2ecf20Sopenharmony_ci
908c2ecf20Sopenharmony_ci	ci->ci_ops->co_io_lock(ci);
918c2ecf20Sopenharmony_ci}
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_civoid ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
948c2ecf20Sopenharmony_ci{
958c2ecf20Sopenharmony_ci	BUG_ON(!ci || !ci->ci_ops);
968c2ecf20Sopenharmony_ci
978c2ecf20Sopenharmony_ci	ci->ci_ops->co_io_unlock(ci);
988c2ecf20Sopenharmony_ci}
998c2ecf20Sopenharmony_ci
1008c2ecf20Sopenharmony_ci
1018c2ecf20Sopenharmony_cistatic void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
1028c2ecf20Sopenharmony_ci				       int clear)
1038c2ecf20Sopenharmony_ci{
1048c2ecf20Sopenharmony_ci	ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
1058c2ecf20Sopenharmony_ci	ci->ci_num_cached = 0;
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_ci	if (clear) {
1088c2ecf20Sopenharmony_ci		ci->ci_created_trans = 0;
1098c2ecf20Sopenharmony_ci		ci->ci_last_trans = 0;
1108c2ecf20Sopenharmony_ci	}
1118c2ecf20Sopenharmony_ci}
1128c2ecf20Sopenharmony_ci
1138c2ecf20Sopenharmony_civoid ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
1148c2ecf20Sopenharmony_ci			       const struct ocfs2_caching_operations *ops)
1158c2ecf20Sopenharmony_ci{
1168c2ecf20Sopenharmony_ci	BUG_ON(!ops);
1178c2ecf20Sopenharmony_ci
1188c2ecf20Sopenharmony_ci	ci->ci_ops = ops;
1198c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_reset(ci, 1);
1208c2ecf20Sopenharmony_ci}
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_civoid ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
1238c2ecf20Sopenharmony_ci{
1248c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_purge(ci);
1258c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_reset(ci, 1);
1268c2ecf20Sopenharmony_ci}
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_ci
1298c2ecf20Sopenharmony_ci/* No lock taken here as 'root' is not expected to be visible to other
1308c2ecf20Sopenharmony_ci * processes. */
1318c2ecf20Sopenharmony_cistatic unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
1328c2ecf20Sopenharmony_ci{
1338c2ecf20Sopenharmony_ci	unsigned int purged = 0;
1348c2ecf20Sopenharmony_ci	struct rb_node *node;
1358c2ecf20Sopenharmony_ci	struct ocfs2_meta_cache_item *item;
1368c2ecf20Sopenharmony_ci
1378c2ecf20Sopenharmony_ci	while ((node = rb_last(root)) != NULL) {
1388c2ecf20Sopenharmony_ci		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
1398c2ecf20Sopenharmony_ci
1408c2ecf20Sopenharmony_ci		trace_ocfs2_purge_copied_metadata_tree(
1418c2ecf20Sopenharmony_ci					(unsigned long long) item->c_block);
1428c2ecf20Sopenharmony_ci
1438c2ecf20Sopenharmony_ci		rb_erase(&item->c_node, root);
1448c2ecf20Sopenharmony_ci		kmem_cache_free(ocfs2_uptodate_cachep, item);
1458c2ecf20Sopenharmony_ci
1468c2ecf20Sopenharmony_ci		purged++;
1478c2ecf20Sopenharmony_ci	}
1488c2ecf20Sopenharmony_ci	return purged;
1498c2ecf20Sopenharmony_ci}
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_ci/* Called from locking and called from ocfs2_clear_inode. Dump the
1528c2ecf20Sopenharmony_ci * cache for a given inode.
1538c2ecf20Sopenharmony_ci *
1548c2ecf20Sopenharmony_ci * This function is a few more lines longer than necessary due to some
1558c2ecf20Sopenharmony_ci * accounting done here, but I think it's worth tracking down those
1568c2ecf20Sopenharmony_ci * bugs sooner -- Mark */
1578c2ecf20Sopenharmony_civoid ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
1588c2ecf20Sopenharmony_ci{
1598c2ecf20Sopenharmony_ci	unsigned int tree, to_purge, purged;
1608c2ecf20Sopenharmony_ci	struct rb_root root = RB_ROOT;
1618c2ecf20Sopenharmony_ci
1628c2ecf20Sopenharmony_ci	BUG_ON(!ci || !ci->ci_ops);
1638c2ecf20Sopenharmony_ci
1648c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_lock(ci);
1658c2ecf20Sopenharmony_ci	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
1668c2ecf20Sopenharmony_ci	to_purge = ci->ci_num_cached;
1678c2ecf20Sopenharmony_ci
1688c2ecf20Sopenharmony_ci	trace_ocfs2_metadata_cache_purge(
1698c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
1708c2ecf20Sopenharmony_ci		to_purge, tree);
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci	/* If we're a tree, save off the root so that we can safely
1738c2ecf20Sopenharmony_ci	 * initialize the cache. We do the work to free tree members
1748c2ecf20Sopenharmony_ci	 * without the spinlock. */
1758c2ecf20Sopenharmony_ci	if (tree)
1768c2ecf20Sopenharmony_ci		root = ci->ci_cache.ci_tree;
1778c2ecf20Sopenharmony_ci
1788c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_reset(ci, 0);
1798c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_unlock(ci);
1808c2ecf20Sopenharmony_ci
1818c2ecf20Sopenharmony_ci	purged = ocfs2_purge_copied_metadata_tree(&root);
1828c2ecf20Sopenharmony_ci	/* If possible, track the number wiped so that we can more
1838c2ecf20Sopenharmony_ci	 * easily detect counting errors. Unfortunately, this is only
1848c2ecf20Sopenharmony_ci	 * meaningful for trees. */
1858c2ecf20Sopenharmony_ci	if (tree && purged != to_purge)
1868c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
1878c2ecf20Sopenharmony_ci		     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1888c2ecf20Sopenharmony_ci		     to_purge, purged);
1898c2ecf20Sopenharmony_ci}
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci/* Returns the index in the cache array, -1 if not found.
1928c2ecf20Sopenharmony_ci * Requires ip_lock. */
1938c2ecf20Sopenharmony_cistatic int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
1948c2ecf20Sopenharmony_ci				    sector_t item)
1958c2ecf20Sopenharmony_ci{
1968c2ecf20Sopenharmony_ci	int i;
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_ci	for (i = 0; i < ci->ci_num_cached; i++) {
1998c2ecf20Sopenharmony_ci		if (item == ci->ci_cache.ci_array[i])
2008c2ecf20Sopenharmony_ci			return i;
2018c2ecf20Sopenharmony_ci	}
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	return -1;
2048c2ecf20Sopenharmony_ci}
2058c2ecf20Sopenharmony_ci
2068c2ecf20Sopenharmony_ci/* Returns the cache item if found, otherwise NULL.
2078c2ecf20Sopenharmony_ci * Requires ip_lock. */
2088c2ecf20Sopenharmony_cistatic struct ocfs2_meta_cache_item *
2098c2ecf20Sopenharmony_ciocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
2108c2ecf20Sopenharmony_ci			sector_t block)
2118c2ecf20Sopenharmony_ci{
2128c2ecf20Sopenharmony_ci	struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
2138c2ecf20Sopenharmony_ci	struct ocfs2_meta_cache_item *item = NULL;
2148c2ecf20Sopenharmony_ci
2158c2ecf20Sopenharmony_ci	while (n) {
2168c2ecf20Sopenharmony_ci		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci		if (block < item->c_block)
2198c2ecf20Sopenharmony_ci			n = n->rb_left;
2208c2ecf20Sopenharmony_ci		else if (block > item->c_block)
2218c2ecf20Sopenharmony_ci			n = n->rb_right;
2228c2ecf20Sopenharmony_ci		else
2238c2ecf20Sopenharmony_ci			return item;
2248c2ecf20Sopenharmony_ci	}
2258c2ecf20Sopenharmony_ci
2268c2ecf20Sopenharmony_ci	return NULL;
2278c2ecf20Sopenharmony_ci}
2288c2ecf20Sopenharmony_ci
2298c2ecf20Sopenharmony_cistatic int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
2308c2ecf20Sopenharmony_ci			       struct buffer_head *bh)
2318c2ecf20Sopenharmony_ci{
2328c2ecf20Sopenharmony_ci	int index = -1;
2338c2ecf20Sopenharmony_ci	struct ocfs2_meta_cache_item *item = NULL;
2348c2ecf20Sopenharmony_ci
2358c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_lock(ci);
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_ci	trace_ocfs2_buffer_cached_begin(
2388c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
2398c2ecf20Sopenharmony_ci		(unsigned long long) bh->b_blocknr,
2408c2ecf20Sopenharmony_ci		!!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
2418c2ecf20Sopenharmony_ci
2428c2ecf20Sopenharmony_ci	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
2438c2ecf20Sopenharmony_ci		index = ocfs2_search_cache_array(ci, bh->b_blocknr);
2448c2ecf20Sopenharmony_ci	else
2458c2ecf20Sopenharmony_ci		item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_unlock(ci);
2488c2ecf20Sopenharmony_ci
2498c2ecf20Sopenharmony_ci	trace_ocfs2_buffer_cached_end(index, item);
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci	return (index != -1) || (item != NULL);
2528c2ecf20Sopenharmony_ci}
2538c2ecf20Sopenharmony_ci
2548c2ecf20Sopenharmony_ci/* Warning: even if it returns true, this does *not* guarantee that
2558c2ecf20Sopenharmony_ci * the block is stored in our inode metadata cache.
2568c2ecf20Sopenharmony_ci *
2578c2ecf20Sopenharmony_ci * This can be called under lock_buffer()
2588c2ecf20Sopenharmony_ci */
2598c2ecf20Sopenharmony_ciint ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
2608c2ecf20Sopenharmony_ci			  struct buffer_head *bh)
2618c2ecf20Sopenharmony_ci{
2628c2ecf20Sopenharmony_ci	/* Doesn't matter if the bh is in our cache or not -- if it's
2638c2ecf20Sopenharmony_ci	 * not marked uptodate then we know it can't have correct
2648c2ecf20Sopenharmony_ci	 * data. */
2658c2ecf20Sopenharmony_ci	if (!buffer_uptodate(bh))
2668c2ecf20Sopenharmony_ci		return 0;
2678c2ecf20Sopenharmony_ci
2688c2ecf20Sopenharmony_ci	/* OCFS2 does not allow multiple nodes to be changing the same
2698c2ecf20Sopenharmony_ci	 * block at the same time. */
2708c2ecf20Sopenharmony_ci	if (buffer_jbd(bh))
2718c2ecf20Sopenharmony_ci		return 1;
2728c2ecf20Sopenharmony_ci
2738c2ecf20Sopenharmony_ci	/* Ok, locally the buffer is marked as up to date, now search
2748c2ecf20Sopenharmony_ci	 * our cache to see if we can trust that. */
2758c2ecf20Sopenharmony_ci	return ocfs2_buffer_cached(ci, bh);
2768c2ecf20Sopenharmony_ci}
2778c2ecf20Sopenharmony_ci
2788c2ecf20Sopenharmony_ci/*
2798c2ecf20Sopenharmony_ci * Determine whether a buffer is currently out on a read-ahead request.
2808c2ecf20Sopenharmony_ci * ci_io_sem should be held to serialize submitters with the logic here.
2818c2ecf20Sopenharmony_ci */
2828c2ecf20Sopenharmony_ciint ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
2838c2ecf20Sopenharmony_ci			    struct buffer_head *bh)
2848c2ecf20Sopenharmony_ci{
2858c2ecf20Sopenharmony_ci	return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
2868c2ecf20Sopenharmony_ci}
2878c2ecf20Sopenharmony_ci
2888c2ecf20Sopenharmony_ci/* Requires ip_lock */
2898c2ecf20Sopenharmony_cistatic void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
2908c2ecf20Sopenharmony_ci				     sector_t block)
2918c2ecf20Sopenharmony_ci{
2928c2ecf20Sopenharmony_ci	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
2938c2ecf20Sopenharmony_ci
2948c2ecf20Sopenharmony_ci	trace_ocfs2_append_cache_array(
2958c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
2968c2ecf20Sopenharmony_ci		(unsigned long long)block, ci->ci_num_cached);
2978c2ecf20Sopenharmony_ci
2988c2ecf20Sopenharmony_ci	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
2998c2ecf20Sopenharmony_ci	ci->ci_num_cached++;
3008c2ecf20Sopenharmony_ci}
3018c2ecf20Sopenharmony_ci
3028c2ecf20Sopenharmony_ci/* By now the caller should have checked that the item does *not*
3038c2ecf20Sopenharmony_ci * exist in the tree.
3048c2ecf20Sopenharmony_ci * Requires ip_lock. */
3058c2ecf20Sopenharmony_cistatic void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
3068c2ecf20Sopenharmony_ci				      struct ocfs2_meta_cache_item *new)
3078c2ecf20Sopenharmony_ci{
3088c2ecf20Sopenharmony_ci	sector_t block = new->c_block;
3098c2ecf20Sopenharmony_ci	struct rb_node *parent = NULL;
3108c2ecf20Sopenharmony_ci	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
3118c2ecf20Sopenharmony_ci	struct ocfs2_meta_cache_item *tmp;
3128c2ecf20Sopenharmony_ci
3138c2ecf20Sopenharmony_ci	trace_ocfs2_insert_cache_tree(
3148c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
3158c2ecf20Sopenharmony_ci		(unsigned long long)block, ci->ci_num_cached);
3168c2ecf20Sopenharmony_ci
3178c2ecf20Sopenharmony_ci	while(*p) {
3188c2ecf20Sopenharmony_ci		parent = *p;
3198c2ecf20Sopenharmony_ci
3208c2ecf20Sopenharmony_ci		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
3218c2ecf20Sopenharmony_ci
3228c2ecf20Sopenharmony_ci		if (block < tmp->c_block)
3238c2ecf20Sopenharmony_ci			p = &(*p)->rb_left;
3248c2ecf20Sopenharmony_ci		else if (block > tmp->c_block)
3258c2ecf20Sopenharmony_ci			p = &(*p)->rb_right;
3268c2ecf20Sopenharmony_ci		else {
3278c2ecf20Sopenharmony_ci			/* This should never happen! */
3288c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
3298c2ecf20Sopenharmony_ci			     (unsigned long long) block);
3308c2ecf20Sopenharmony_ci			BUG();
3318c2ecf20Sopenharmony_ci		}
3328c2ecf20Sopenharmony_ci	}
3338c2ecf20Sopenharmony_ci
3348c2ecf20Sopenharmony_ci	rb_link_node(&new->c_node, parent, p);
3358c2ecf20Sopenharmony_ci	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
3368c2ecf20Sopenharmony_ci	ci->ci_num_cached++;
3378c2ecf20Sopenharmony_ci}
3388c2ecf20Sopenharmony_ci
3398c2ecf20Sopenharmony_ci/* co_cache_lock() must be held */
3408c2ecf20Sopenharmony_cistatic inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
3418c2ecf20Sopenharmony_ci{
3428c2ecf20Sopenharmony_ci	return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
3438c2ecf20Sopenharmony_ci		(ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
3448c2ecf20Sopenharmony_ci}
3458c2ecf20Sopenharmony_ci
3468c2ecf20Sopenharmony_ci/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
3478c2ecf20Sopenharmony_ci * pointers in tree after we use them - this allows caller to detect
3488c2ecf20Sopenharmony_ci * when to free in case of error.
3498c2ecf20Sopenharmony_ci *
3508c2ecf20Sopenharmony_ci * The co_cache_lock() must be held. */
3518c2ecf20Sopenharmony_cistatic void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
3528c2ecf20Sopenharmony_ci			       struct ocfs2_meta_cache_item **tree)
3538c2ecf20Sopenharmony_ci{
3548c2ecf20Sopenharmony_ci	int i;
3558c2ecf20Sopenharmony_ci
3568c2ecf20Sopenharmony_ci	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
3578c2ecf20Sopenharmony_ci			"Owner %llu, num cached = %u, should be %u\n",
3588c2ecf20Sopenharmony_ci			(unsigned long long)ocfs2_metadata_cache_owner(ci),
3598c2ecf20Sopenharmony_ci			ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
3608c2ecf20Sopenharmony_ci	mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
3618c2ecf20Sopenharmony_ci			"Owner %llu not marked as inline anymore!\n",
3628c2ecf20Sopenharmony_ci			(unsigned long long)ocfs2_metadata_cache_owner(ci));
3638c2ecf20Sopenharmony_ci
3648c2ecf20Sopenharmony_ci	/* Be careful to initialize the tree members *first* because
3658c2ecf20Sopenharmony_ci	 * once the ci_tree is used, the array is junk... */
3668c2ecf20Sopenharmony_ci	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
3678c2ecf20Sopenharmony_ci		tree[i]->c_block = ci->ci_cache.ci_array[i];
3688c2ecf20Sopenharmony_ci
3698c2ecf20Sopenharmony_ci	ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
3708c2ecf20Sopenharmony_ci	ci->ci_cache.ci_tree = RB_ROOT;
3718c2ecf20Sopenharmony_ci	/* this will be set again by __ocfs2_insert_cache_tree */
3728c2ecf20Sopenharmony_ci	ci->ci_num_cached = 0;
3738c2ecf20Sopenharmony_ci
3748c2ecf20Sopenharmony_ci	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
3758c2ecf20Sopenharmony_ci		__ocfs2_insert_cache_tree(ci, tree[i]);
3768c2ecf20Sopenharmony_ci		tree[i] = NULL;
3778c2ecf20Sopenharmony_ci	}
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci	trace_ocfs2_expand_cache(
3808c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
3818c2ecf20Sopenharmony_ci		ci->ci_flags, ci->ci_num_cached);
3828c2ecf20Sopenharmony_ci}
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_ci/* Slow path function - memory allocation is necessary. See the
3858c2ecf20Sopenharmony_ci * comment above ocfs2_set_buffer_uptodate for more information. */
3868c2ecf20Sopenharmony_cistatic void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
3878c2ecf20Sopenharmony_ci					sector_t block,
3888c2ecf20Sopenharmony_ci					int expand_tree)
3898c2ecf20Sopenharmony_ci{
3908c2ecf20Sopenharmony_ci	int i;
3918c2ecf20Sopenharmony_ci	struct ocfs2_meta_cache_item *new = NULL;
3928c2ecf20Sopenharmony_ci	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
3938c2ecf20Sopenharmony_ci		{ NULL, };
3948c2ecf20Sopenharmony_ci
3958c2ecf20Sopenharmony_ci	trace_ocfs2_set_buffer_uptodate(
3968c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
3978c2ecf20Sopenharmony_ci		(unsigned long long)block, expand_tree);
3988c2ecf20Sopenharmony_ci
3998c2ecf20Sopenharmony_ci	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
4008c2ecf20Sopenharmony_ci	if (!new) {
4018c2ecf20Sopenharmony_ci		mlog_errno(-ENOMEM);
4028c2ecf20Sopenharmony_ci		return;
4038c2ecf20Sopenharmony_ci	}
4048c2ecf20Sopenharmony_ci	new->c_block = block;
4058c2ecf20Sopenharmony_ci
4068c2ecf20Sopenharmony_ci	if (expand_tree) {
4078c2ecf20Sopenharmony_ci		/* Do *not* allocate an array here - the removal code
4088c2ecf20Sopenharmony_ci		 * has no way of tracking that. */
4098c2ecf20Sopenharmony_ci		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
4108c2ecf20Sopenharmony_ci			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
4118c2ecf20Sopenharmony_ci						   GFP_NOFS);
4128c2ecf20Sopenharmony_ci			if (!tree[i]) {
4138c2ecf20Sopenharmony_ci				mlog_errno(-ENOMEM);
4148c2ecf20Sopenharmony_ci				goto out_free;
4158c2ecf20Sopenharmony_ci			}
4168c2ecf20Sopenharmony_ci
4178c2ecf20Sopenharmony_ci			/* These are initialized in ocfs2_expand_cache! */
4188c2ecf20Sopenharmony_ci		}
4198c2ecf20Sopenharmony_ci	}
4208c2ecf20Sopenharmony_ci
4218c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_lock(ci);
4228c2ecf20Sopenharmony_ci	if (ocfs2_insert_can_use_array(ci)) {
4238c2ecf20Sopenharmony_ci		/* Ok, items were removed from the cache in between
4248c2ecf20Sopenharmony_ci		 * locks. Detect this and revert back to the fast path */
4258c2ecf20Sopenharmony_ci		ocfs2_append_cache_array(ci, block);
4268c2ecf20Sopenharmony_ci		ocfs2_metadata_cache_unlock(ci);
4278c2ecf20Sopenharmony_ci		goto out_free;
4288c2ecf20Sopenharmony_ci	}
4298c2ecf20Sopenharmony_ci
4308c2ecf20Sopenharmony_ci	if (expand_tree)
4318c2ecf20Sopenharmony_ci		ocfs2_expand_cache(ci, tree);
4328c2ecf20Sopenharmony_ci
4338c2ecf20Sopenharmony_ci	__ocfs2_insert_cache_tree(ci, new);
4348c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_unlock(ci);
4358c2ecf20Sopenharmony_ci
4368c2ecf20Sopenharmony_ci	new = NULL;
4378c2ecf20Sopenharmony_ciout_free:
4388c2ecf20Sopenharmony_ci	if (new)
4398c2ecf20Sopenharmony_ci		kmem_cache_free(ocfs2_uptodate_cachep, new);
4408c2ecf20Sopenharmony_ci
4418c2ecf20Sopenharmony_ci	/* If these were used, then ocfs2_expand_cache re-set them to
4428c2ecf20Sopenharmony_ci	 * NULL for us. */
4438c2ecf20Sopenharmony_ci	if (tree[0]) {
4448c2ecf20Sopenharmony_ci		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
4458c2ecf20Sopenharmony_ci			if (tree[i])
4468c2ecf20Sopenharmony_ci				kmem_cache_free(ocfs2_uptodate_cachep,
4478c2ecf20Sopenharmony_ci						tree[i]);
4488c2ecf20Sopenharmony_ci	}
4498c2ecf20Sopenharmony_ci}
4508c2ecf20Sopenharmony_ci
4518c2ecf20Sopenharmony_ci/* Item insertion is guarded by co_io_lock(), so the insertion path takes
4528c2ecf20Sopenharmony_ci * advantage of this by not rechecking for a duplicate insert during
4538c2ecf20Sopenharmony_ci * the slow case. Additionally, if the cache needs to be bumped up to
4548c2ecf20Sopenharmony_ci * a tree, the code will not recheck after acquiring the lock --
4558c2ecf20Sopenharmony_ci * multiple paths cannot be expanding to a tree at the same time.
4568c2ecf20Sopenharmony_ci *
4578c2ecf20Sopenharmony_ci * The slow path takes into account that items can be removed
4588c2ecf20Sopenharmony_ci * (including the whole tree wiped and reset) when this process it out
4598c2ecf20Sopenharmony_ci * allocating memory. In those cases, it reverts back to the fast
4608c2ecf20Sopenharmony_ci * path.
4618c2ecf20Sopenharmony_ci *
4628c2ecf20Sopenharmony_ci * Note that this function may actually fail to insert the block if
4638c2ecf20Sopenharmony_ci * memory cannot be allocated. This is not fatal however (but may
4648c2ecf20Sopenharmony_ci * result in a performance penalty)
4658c2ecf20Sopenharmony_ci *
4668c2ecf20Sopenharmony_ci * Readahead buffers can be passed in here before the I/O request is
4678c2ecf20Sopenharmony_ci * completed.
4688c2ecf20Sopenharmony_ci */
4698c2ecf20Sopenharmony_civoid ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
4708c2ecf20Sopenharmony_ci			       struct buffer_head *bh)
4718c2ecf20Sopenharmony_ci{
4728c2ecf20Sopenharmony_ci	int expand;
4738c2ecf20Sopenharmony_ci
4748c2ecf20Sopenharmony_ci	/* The block may very well exist in our cache already, so avoid
4758c2ecf20Sopenharmony_ci	 * doing any more work in that case. */
4768c2ecf20Sopenharmony_ci	if (ocfs2_buffer_cached(ci, bh))
4778c2ecf20Sopenharmony_ci		return;
4788c2ecf20Sopenharmony_ci
4798c2ecf20Sopenharmony_ci	trace_ocfs2_set_buffer_uptodate_begin(
4808c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
4818c2ecf20Sopenharmony_ci		(unsigned long long)bh->b_blocknr);
4828c2ecf20Sopenharmony_ci
4838c2ecf20Sopenharmony_ci	/* No need to recheck under spinlock - insertion is guarded by
4848c2ecf20Sopenharmony_ci	 * co_io_lock() */
4858c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_lock(ci);
4868c2ecf20Sopenharmony_ci	if (ocfs2_insert_can_use_array(ci)) {
4878c2ecf20Sopenharmony_ci		/* Fast case - it's an array and there's a free
4888c2ecf20Sopenharmony_ci		 * spot. */
4898c2ecf20Sopenharmony_ci		ocfs2_append_cache_array(ci, bh->b_blocknr);
4908c2ecf20Sopenharmony_ci		ocfs2_metadata_cache_unlock(ci);
4918c2ecf20Sopenharmony_ci		return;
4928c2ecf20Sopenharmony_ci	}
4938c2ecf20Sopenharmony_ci
4948c2ecf20Sopenharmony_ci	expand = 0;
4958c2ecf20Sopenharmony_ci	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
4968c2ecf20Sopenharmony_ci		/* We need to bump things up to a tree. */
4978c2ecf20Sopenharmony_ci		expand = 1;
4988c2ecf20Sopenharmony_ci	}
4998c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_unlock(ci);
5008c2ecf20Sopenharmony_ci
5018c2ecf20Sopenharmony_ci	__ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
5028c2ecf20Sopenharmony_ci}
5038c2ecf20Sopenharmony_ci
5048c2ecf20Sopenharmony_ci/* Called against a newly allocated buffer. Most likely nobody should
5058c2ecf20Sopenharmony_ci * be able to read this sort of metadata while it's still being
5068c2ecf20Sopenharmony_ci * allocated, but this is careful to take co_io_lock() anyway. */
5078c2ecf20Sopenharmony_civoid ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
5088c2ecf20Sopenharmony_ci				   struct buffer_head *bh)
5098c2ecf20Sopenharmony_ci{
5108c2ecf20Sopenharmony_ci	/* This should definitely *not* exist in our cache */
5118c2ecf20Sopenharmony_ci	BUG_ON(ocfs2_buffer_cached(ci, bh));
5128c2ecf20Sopenharmony_ci
5138c2ecf20Sopenharmony_ci	set_buffer_uptodate(bh);
5148c2ecf20Sopenharmony_ci
5158c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_io_lock(ci);
5168c2ecf20Sopenharmony_ci	ocfs2_set_buffer_uptodate(ci, bh);
5178c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_io_unlock(ci);
5188c2ecf20Sopenharmony_ci}
5198c2ecf20Sopenharmony_ci
5208c2ecf20Sopenharmony_ci/* Requires ip_lock. */
5218c2ecf20Sopenharmony_cistatic void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
5228c2ecf20Sopenharmony_ci					int index)
5238c2ecf20Sopenharmony_ci{
5248c2ecf20Sopenharmony_ci	sector_t *array = ci->ci_cache.ci_array;
5258c2ecf20Sopenharmony_ci	int bytes;
5268c2ecf20Sopenharmony_ci
5278c2ecf20Sopenharmony_ci	BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
5288c2ecf20Sopenharmony_ci	BUG_ON(index >= ci->ci_num_cached);
5298c2ecf20Sopenharmony_ci	BUG_ON(!ci->ci_num_cached);
5308c2ecf20Sopenharmony_ci
5318c2ecf20Sopenharmony_ci	trace_ocfs2_remove_metadata_array(
5328c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
5338c2ecf20Sopenharmony_ci		index, ci->ci_num_cached);
5348c2ecf20Sopenharmony_ci
5358c2ecf20Sopenharmony_ci	ci->ci_num_cached--;
5368c2ecf20Sopenharmony_ci
5378c2ecf20Sopenharmony_ci	/* don't need to copy if the array is now empty, or if we
5388c2ecf20Sopenharmony_ci	 * removed at the tail */
5398c2ecf20Sopenharmony_ci	if (ci->ci_num_cached && index < ci->ci_num_cached) {
5408c2ecf20Sopenharmony_ci		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
5418c2ecf20Sopenharmony_ci		memmove(&array[index], &array[index + 1], bytes);
5428c2ecf20Sopenharmony_ci	}
5438c2ecf20Sopenharmony_ci}
5448c2ecf20Sopenharmony_ci
5458c2ecf20Sopenharmony_ci/* Requires ip_lock. */
5468c2ecf20Sopenharmony_cistatic void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
5478c2ecf20Sopenharmony_ci				       struct ocfs2_meta_cache_item *item)
5488c2ecf20Sopenharmony_ci{
5498c2ecf20Sopenharmony_ci	trace_ocfs2_remove_metadata_tree(
5508c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
5518c2ecf20Sopenharmony_ci		(unsigned long long)item->c_block);
5528c2ecf20Sopenharmony_ci
5538c2ecf20Sopenharmony_ci	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
5548c2ecf20Sopenharmony_ci	ci->ci_num_cached--;
5558c2ecf20Sopenharmony_ci}
5568c2ecf20Sopenharmony_ci
5578c2ecf20Sopenharmony_cistatic void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
5588c2ecf20Sopenharmony_ci					  sector_t block)
5598c2ecf20Sopenharmony_ci{
5608c2ecf20Sopenharmony_ci	int index;
5618c2ecf20Sopenharmony_ci	struct ocfs2_meta_cache_item *item = NULL;
5628c2ecf20Sopenharmony_ci
5638c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_lock(ci);
5648c2ecf20Sopenharmony_ci	trace_ocfs2_remove_block_from_cache(
5658c2ecf20Sopenharmony_ci		(unsigned long long)ocfs2_metadata_cache_owner(ci),
5668c2ecf20Sopenharmony_ci		(unsigned long long) block, ci->ci_num_cached,
5678c2ecf20Sopenharmony_ci		ci->ci_flags);
5688c2ecf20Sopenharmony_ci
5698c2ecf20Sopenharmony_ci	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
5708c2ecf20Sopenharmony_ci		index = ocfs2_search_cache_array(ci, block);
5718c2ecf20Sopenharmony_ci		if (index != -1)
5728c2ecf20Sopenharmony_ci			ocfs2_remove_metadata_array(ci, index);
5738c2ecf20Sopenharmony_ci	} else {
5748c2ecf20Sopenharmony_ci		item = ocfs2_search_cache_tree(ci, block);
5758c2ecf20Sopenharmony_ci		if (item)
5768c2ecf20Sopenharmony_ci			ocfs2_remove_metadata_tree(ci, item);
5778c2ecf20Sopenharmony_ci	}
5788c2ecf20Sopenharmony_ci	ocfs2_metadata_cache_unlock(ci);
5798c2ecf20Sopenharmony_ci
5808c2ecf20Sopenharmony_ci	if (item)
5818c2ecf20Sopenharmony_ci		kmem_cache_free(ocfs2_uptodate_cachep, item);
5828c2ecf20Sopenharmony_ci}
5838c2ecf20Sopenharmony_ci
5848c2ecf20Sopenharmony_ci/*
5858c2ecf20Sopenharmony_ci * Called when we remove a chunk of metadata from an inode. We don't
5868c2ecf20Sopenharmony_ci * bother reverting things to an inlined array in the case of a remove
5878c2ecf20Sopenharmony_ci * which moves us back under the limit.
5888c2ecf20Sopenharmony_ci */
5898c2ecf20Sopenharmony_civoid ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
5908c2ecf20Sopenharmony_ci			     struct buffer_head *bh)
5918c2ecf20Sopenharmony_ci{
5928c2ecf20Sopenharmony_ci	sector_t block = bh->b_blocknr;
5938c2ecf20Sopenharmony_ci
5948c2ecf20Sopenharmony_ci	ocfs2_remove_block_from_cache(ci, block);
5958c2ecf20Sopenharmony_ci}
5968c2ecf20Sopenharmony_ci
5978c2ecf20Sopenharmony_ci/* Called when we remove xattr clusters from an inode. */
5988c2ecf20Sopenharmony_civoid ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
5998c2ecf20Sopenharmony_ci					    sector_t block,
6008c2ecf20Sopenharmony_ci					    u32 c_len)
6018c2ecf20Sopenharmony_ci{
6028c2ecf20Sopenharmony_ci	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
6038c2ecf20Sopenharmony_ci	unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
6048c2ecf20Sopenharmony_ci
6058c2ecf20Sopenharmony_ci	for (i = 0; i < b_len; i++, block++)
6068c2ecf20Sopenharmony_ci		ocfs2_remove_block_from_cache(ci, block);
6078c2ecf20Sopenharmony_ci}
6088c2ecf20Sopenharmony_ci
6098c2ecf20Sopenharmony_ciint __init init_ocfs2_uptodate_cache(void)
6108c2ecf20Sopenharmony_ci{
6118c2ecf20Sopenharmony_ci	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
6128c2ecf20Sopenharmony_ci				  sizeof(struct ocfs2_meta_cache_item),
6138c2ecf20Sopenharmony_ci				  0, SLAB_HWCACHE_ALIGN, NULL);
6148c2ecf20Sopenharmony_ci	if (!ocfs2_uptodate_cachep)
6158c2ecf20Sopenharmony_ci		return -ENOMEM;
6168c2ecf20Sopenharmony_ci
6178c2ecf20Sopenharmony_ci	return 0;
6188c2ecf20Sopenharmony_ci}
6198c2ecf20Sopenharmony_ci
6208c2ecf20Sopenharmony_civoid exit_ocfs2_uptodate_cache(void)
6218c2ecf20Sopenharmony_ci{
6228c2ecf20Sopenharmony_ci	kmem_cache_destroy(ocfs2_uptodate_cachep);
6238c2ecf20Sopenharmony_ci}
624