162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2021-2023 Oracle.  All Rights Reserved.
462306a36Sopenharmony_ci * Author: Darrick J. Wong <djwong@kernel.org>
562306a36Sopenharmony_ci */
662306a36Sopenharmony_ci#include "xfs.h"
762306a36Sopenharmony_ci#include "xfs_fs.h"
862306a36Sopenharmony_ci#include "xfs_shared.h"
962306a36Sopenharmony_ci#include "xfs_format.h"
1062306a36Sopenharmony_ci#include "scrub/xfile.h"
1162306a36Sopenharmony_ci#include "scrub/xfarray.h"
1262306a36Sopenharmony_ci#include "scrub/scrub.h"
1362306a36Sopenharmony_ci#include "scrub/trace.h"
1462306a36Sopenharmony_ci
1562306a36Sopenharmony_ci/*
1662306a36Sopenharmony_ci * Large Arrays of Fixed-Size Records
1762306a36Sopenharmony_ci * ==================================
1862306a36Sopenharmony_ci *
1962306a36Sopenharmony_ci * This memory array uses an xfile (which itself is a memfd "file") to store
2062306a36Sopenharmony_ci * large numbers of fixed-size records in memory that can be paged out.  This
2162306a36Sopenharmony_ci * puts less stress on the memory reclaim algorithms during an online repair
2262306a36Sopenharmony_ci * because we don't have to pin so much memory.  However, array access is less
2362306a36Sopenharmony_ci * direct than would be in a regular memory array.  Access to the array is
2462306a36Sopenharmony_ci * performed via indexed load and store methods, and an append method is
2562306a36Sopenharmony_ci * provided for convenience.  Array elements can be unset, which sets them to
2662306a36Sopenharmony_ci * all zeroes.  Unset entries are skipped during iteration, though direct loads
2762306a36Sopenharmony_ci * will return a zeroed buffer.  Callers are responsible for concurrency
2862306a36Sopenharmony_ci * control.
2962306a36Sopenharmony_ci */
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_ci/*
3262306a36Sopenharmony_ci * Pointer to scratch space.  Because we can't access the xfile data directly,
3362306a36Sopenharmony_ci * we allocate a small amount of memory on the end of the xfarray structure to
3462306a36Sopenharmony_ci * buffer array items when we need space to store values temporarily.
3562306a36Sopenharmony_ci */
3662306a36Sopenharmony_cistatic inline void *xfarray_scratch(struct xfarray *array)
3762306a36Sopenharmony_ci{
3862306a36Sopenharmony_ci	return (array + 1);
3962306a36Sopenharmony_ci}
4062306a36Sopenharmony_ci
4162306a36Sopenharmony_ci/* Compute array index given an xfile offset. */
4262306a36Sopenharmony_cistatic xfarray_idx_t
4362306a36Sopenharmony_cixfarray_idx(
4462306a36Sopenharmony_ci	struct xfarray	*array,
4562306a36Sopenharmony_ci	loff_t		pos)
4662306a36Sopenharmony_ci{
4762306a36Sopenharmony_ci	if (array->obj_size_log >= 0)
4862306a36Sopenharmony_ci		return (xfarray_idx_t)pos >> array->obj_size_log;
4962306a36Sopenharmony_ci
5062306a36Sopenharmony_ci	return div_u64((xfarray_idx_t)pos, array->obj_size);
5162306a36Sopenharmony_ci}
5262306a36Sopenharmony_ci
5362306a36Sopenharmony_ci/* Compute xfile offset of array element. */
5462306a36Sopenharmony_cistatic inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
5562306a36Sopenharmony_ci{
5662306a36Sopenharmony_ci	if (array->obj_size_log >= 0)
5762306a36Sopenharmony_ci		return idx << array->obj_size_log;
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci	return idx * array->obj_size;
6062306a36Sopenharmony_ci}
6162306a36Sopenharmony_ci
6262306a36Sopenharmony_ci/*
6362306a36Sopenharmony_ci * Initialize a big memory array.  Array records cannot be larger than a
6462306a36Sopenharmony_ci * page, and the array cannot span more bytes than the page cache supports.
6562306a36Sopenharmony_ci * If @required_capacity is nonzero, the maximum array size will be set to this
6662306a36Sopenharmony_ci * quantity and the array creation will fail if the underlying storage cannot
6762306a36Sopenharmony_ci * support that many records.
6862306a36Sopenharmony_ci */
6962306a36Sopenharmony_ciint
7062306a36Sopenharmony_cixfarray_create(
7162306a36Sopenharmony_ci	const char		*description,
7262306a36Sopenharmony_ci	unsigned long long	required_capacity,
7362306a36Sopenharmony_ci	size_t			obj_size,
7462306a36Sopenharmony_ci	struct xfarray		**arrayp)
7562306a36Sopenharmony_ci{
7662306a36Sopenharmony_ci	struct xfarray		*array;
7762306a36Sopenharmony_ci	struct xfile		*xfile;
7862306a36Sopenharmony_ci	int			error;
7962306a36Sopenharmony_ci
8062306a36Sopenharmony_ci	ASSERT(obj_size < PAGE_SIZE);
8162306a36Sopenharmony_ci
8262306a36Sopenharmony_ci	error = xfile_create(description, 0, &xfile);
8362306a36Sopenharmony_ci	if (error)
8462306a36Sopenharmony_ci		return error;
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_ci	error = -ENOMEM;
8762306a36Sopenharmony_ci	array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
8862306a36Sopenharmony_ci	if (!array)
8962306a36Sopenharmony_ci		goto out_xfile;
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci	array->xfile = xfile;
9262306a36Sopenharmony_ci	array->obj_size = obj_size;
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci	if (is_power_of_2(obj_size))
9562306a36Sopenharmony_ci		array->obj_size_log = ilog2(obj_size);
9662306a36Sopenharmony_ci	else
9762306a36Sopenharmony_ci		array->obj_size_log = -1;
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci	array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
10062306a36Sopenharmony_ci	trace_xfarray_create(array, required_capacity);
10162306a36Sopenharmony_ci
10262306a36Sopenharmony_ci	if (required_capacity > 0) {
10362306a36Sopenharmony_ci		if (array->max_nr < required_capacity) {
10462306a36Sopenharmony_ci			error = -ENOMEM;
10562306a36Sopenharmony_ci			goto out_xfarray;
10662306a36Sopenharmony_ci		}
10762306a36Sopenharmony_ci		array->max_nr = required_capacity;
10862306a36Sopenharmony_ci	}
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci	*arrayp = array;
11162306a36Sopenharmony_ci	return 0;
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_ciout_xfarray:
11462306a36Sopenharmony_ci	kfree(array);
11562306a36Sopenharmony_ciout_xfile:
11662306a36Sopenharmony_ci	xfile_destroy(xfile);
11762306a36Sopenharmony_ci	return error;
11862306a36Sopenharmony_ci}
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_ci/* Destroy the array. */
12162306a36Sopenharmony_civoid
12262306a36Sopenharmony_cixfarray_destroy(
12362306a36Sopenharmony_ci	struct xfarray	*array)
12462306a36Sopenharmony_ci{
12562306a36Sopenharmony_ci	xfile_destroy(array->xfile);
12662306a36Sopenharmony_ci	kfree(array);
12762306a36Sopenharmony_ci}
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci/* Load an element from the array. */
13062306a36Sopenharmony_ciint
13162306a36Sopenharmony_cixfarray_load(
13262306a36Sopenharmony_ci	struct xfarray	*array,
13362306a36Sopenharmony_ci	xfarray_idx_t	idx,
13462306a36Sopenharmony_ci	void		*ptr)
13562306a36Sopenharmony_ci{
13662306a36Sopenharmony_ci	if (idx >= array->nr)
13762306a36Sopenharmony_ci		return -ENODATA;
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci	return xfile_obj_load(array->xfile, ptr, array->obj_size,
14062306a36Sopenharmony_ci			xfarray_pos(array, idx));
14162306a36Sopenharmony_ci}
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ci/* Is this array element potentially unset? */
14462306a36Sopenharmony_cistatic inline bool
14562306a36Sopenharmony_cixfarray_is_unset(
14662306a36Sopenharmony_ci	struct xfarray	*array,
14762306a36Sopenharmony_ci	loff_t		pos)
14862306a36Sopenharmony_ci{
14962306a36Sopenharmony_ci	void		*temp = xfarray_scratch(array);
15062306a36Sopenharmony_ci	int		error;
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci	if (array->unset_slots == 0)
15362306a36Sopenharmony_ci		return false;
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
15662306a36Sopenharmony_ci	if (!error && xfarray_element_is_null(array, temp))
15762306a36Sopenharmony_ci		return true;
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_ci	return false;
16062306a36Sopenharmony_ci}
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_ci/*
16362306a36Sopenharmony_ci * Unset an array element.  If @idx is the last element in the array, the
16462306a36Sopenharmony_ci * array will be truncated.  Otherwise, the entry will be zeroed.
16562306a36Sopenharmony_ci */
16662306a36Sopenharmony_ciint
16762306a36Sopenharmony_cixfarray_unset(
16862306a36Sopenharmony_ci	struct xfarray	*array,
16962306a36Sopenharmony_ci	xfarray_idx_t	idx)
17062306a36Sopenharmony_ci{
17162306a36Sopenharmony_ci	void		*temp = xfarray_scratch(array);
17262306a36Sopenharmony_ci	loff_t		pos = xfarray_pos(array, idx);
17362306a36Sopenharmony_ci	int		error;
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci	if (idx >= array->nr)
17662306a36Sopenharmony_ci		return -ENODATA;
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci	if (idx == array->nr - 1) {
17962306a36Sopenharmony_ci		array->nr--;
18062306a36Sopenharmony_ci		return 0;
18162306a36Sopenharmony_ci	}
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_ci	if (xfarray_is_unset(array, pos))
18462306a36Sopenharmony_ci		return 0;
18562306a36Sopenharmony_ci
18662306a36Sopenharmony_ci	memset(temp, 0, array->obj_size);
18762306a36Sopenharmony_ci	error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
18862306a36Sopenharmony_ci	if (error)
18962306a36Sopenharmony_ci		return error;
19062306a36Sopenharmony_ci
19162306a36Sopenharmony_ci	array->unset_slots++;
19262306a36Sopenharmony_ci	return 0;
19362306a36Sopenharmony_ci}
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_ci/*
19662306a36Sopenharmony_ci * Store an element in the array.  The element must not be completely zeroed,
19762306a36Sopenharmony_ci * because those are considered unset sparse elements.
19862306a36Sopenharmony_ci */
19962306a36Sopenharmony_ciint
20062306a36Sopenharmony_cixfarray_store(
20162306a36Sopenharmony_ci	struct xfarray	*array,
20262306a36Sopenharmony_ci	xfarray_idx_t	idx,
20362306a36Sopenharmony_ci	const void	*ptr)
20462306a36Sopenharmony_ci{
20562306a36Sopenharmony_ci	int		ret;
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	if (idx >= array->max_nr)
20862306a36Sopenharmony_ci		return -EFBIG;
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	ASSERT(!xfarray_element_is_null(array, ptr));
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
21362306a36Sopenharmony_ci			xfarray_pos(array, idx));
21462306a36Sopenharmony_ci	if (ret)
21562306a36Sopenharmony_ci		return ret;
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	array->nr = max(array->nr, idx + 1);
21862306a36Sopenharmony_ci	return 0;
21962306a36Sopenharmony_ci}
22062306a36Sopenharmony_ci
22162306a36Sopenharmony_ci/* Is this array element NULL? */
22262306a36Sopenharmony_cibool
22362306a36Sopenharmony_cixfarray_element_is_null(
22462306a36Sopenharmony_ci	struct xfarray	*array,
22562306a36Sopenharmony_ci	const void	*ptr)
22662306a36Sopenharmony_ci{
22762306a36Sopenharmony_ci	return !memchr_inv(ptr, 0, array->obj_size);
22862306a36Sopenharmony_ci}
22962306a36Sopenharmony_ci
23062306a36Sopenharmony_ci/*
23162306a36Sopenharmony_ci * Store an element anywhere in the array that is unset.  If there are no
23262306a36Sopenharmony_ci * unset slots, append the element to the array.
23362306a36Sopenharmony_ci */
23462306a36Sopenharmony_ciint
23562306a36Sopenharmony_cixfarray_store_anywhere(
23662306a36Sopenharmony_ci	struct xfarray	*array,
23762306a36Sopenharmony_ci	const void	*ptr)
23862306a36Sopenharmony_ci{
23962306a36Sopenharmony_ci	void		*temp = xfarray_scratch(array);
24062306a36Sopenharmony_ci	loff_t		endpos = xfarray_pos(array, array->nr);
24162306a36Sopenharmony_ci	loff_t		pos;
24262306a36Sopenharmony_ci	int		error;
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci	/* Find an unset slot to put it in. */
24562306a36Sopenharmony_ci	for (pos = 0;
24662306a36Sopenharmony_ci	     pos < endpos && array->unset_slots > 0;
24762306a36Sopenharmony_ci	     pos += array->obj_size) {
24862306a36Sopenharmony_ci		error = xfile_obj_load(array->xfile, temp, array->obj_size,
24962306a36Sopenharmony_ci				pos);
25062306a36Sopenharmony_ci		if (error || !xfarray_element_is_null(array, temp))
25162306a36Sopenharmony_ci			continue;
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci		error = xfile_obj_store(array->xfile, ptr, array->obj_size,
25462306a36Sopenharmony_ci				pos);
25562306a36Sopenharmony_ci		if (error)
25662306a36Sopenharmony_ci			return error;
25762306a36Sopenharmony_ci
25862306a36Sopenharmony_ci		array->unset_slots--;
25962306a36Sopenharmony_ci		return 0;
26062306a36Sopenharmony_ci	}
26162306a36Sopenharmony_ci
26262306a36Sopenharmony_ci	/* No unset slots found; attach it on the end. */
26362306a36Sopenharmony_ci	array->unset_slots = 0;
26462306a36Sopenharmony_ci	return xfarray_append(array, ptr);
26562306a36Sopenharmony_ci}
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_ci/* Return length of array. */
26862306a36Sopenharmony_ciuint64_t
26962306a36Sopenharmony_cixfarray_length(
27062306a36Sopenharmony_ci	struct xfarray	*array)
27162306a36Sopenharmony_ci{
27262306a36Sopenharmony_ci	return array->nr;
27362306a36Sopenharmony_ci}
27462306a36Sopenharmony_ci
27562306a36Sopenharmony_ci/*
27662306a36Sopenharmony_ci * Decide which array item we're going to read as part of an _iter_get.
27762306a36Sopenharmony_ci * @cur is the array index, and @pos is the file offset of that array index in
27862306a36Sopenharmony_ci * the backing xfile.  Returns ENODATA if we reach the end of the records.
27962306a36Sopenharmony_ci *
28062306a36Sopenharmony_ci * Reading from a hole in a sparse xfile causes page instantiation, so for
28162306a36Sopenharmony_ci * iterating a (possibly sparse) array we need to figure out if the cursor is
28262306a36Sopenharmony_ci * pointing at a totally uninitialized hole and move the cursor up if
28362306a36Sopenharmony_ci * necessary.
28462306a36Sopenharmony_ci */
28562306a36Sopenharmony_cistatic inline int
28662306a36Sopenharmony_cixfarray_find_data(
28762306a36Sopenharmony_ci	struct xfarray	*array,
28862306a36Sopenharmony_ci	xfarray_idx_t	*cur,
28962306a36Sopenharmony_ci	loff_t		*pos)
29062306a36Sopenharmony_ci{
29162306a36Sopenharmony_ci	unsigned int	pgoff = offset_in_page(*pos);
29262306a36Sopenharmony_ci	loff_t		end_pos = *pos + array->obj_size - 1;
29362306a36Sopenharmony_ci	loff_t		new_pos;
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	/*
29662306a36Sopenharmony_ci	 * If the current array record is not adjacent to a page boundary, we
29762306a36Sopenharmony_ci	 * are in the middle of the page.  We do not need to move the cursor.
29862306a36Sopenharmony_ci	 */
29962306a36Sopenharmony_ci	if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
30062306a36Sopenharmony_ci		return 0;
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	/*
30362306a36Sopenharmony_ci	 * Call SEEK_DATA on the last byte in the record we're about to read.
30462306a36Sopenharmony_ci	 * If the record ends at (or crosses) the end of a page then we know
30562306a36Sopenharmony_ci	 * that the first byte of the record is backed by pages and don't need
30662306a36Sopenharmony_ci	 * to query it.  If instead the record begins at the start of the page
30762306a36Sopenharmony_ci	 * then we know that querying the last byte is just as good as querying
30862306a36Sopenharmony_ci	 * the first byte, since records cannot be larger than a page.
30962306a36Sopenharmony_ci	 *
31062306a36Sopenharmony_ci	 * If the call returns the same file offset, we know this record is
31162306a36Sopenharmony_ci	 * backed by real pages.  We do not need to move the cursor.
31262306a36Sopenharmony_ci	 */
31362306a36Sopenharmony_ci	new_pos = xfile_seek_data(array->xfile, end_pos);
31462306a36Sopenharmony_ci	if (new_pos == -ENXIO)
31562306a36Sopenharmony_ci		return -ENODATA;
31662306a36Sopenharmony_ci	if (new_pos < 0)
31762306a36Sopenharmony_ci		return new_pos;
31862306a36Sopenharmony_ci	if (new_pos == end_pos)
31962306a36Sopenharmony_ci		return 0;
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ci	/*
32262306a36Sopenharmony_ci	 * Otherwise, SEEK_DATA told us how far up to move the file pointer to
32362306a36Sopenharmony_ci	 * find more data.  Move the array index to the first record past the
32462306a36Sopenharmony_ci	 * byte offset we were given.
32562306a36Sopenharmony_ci	 */
32662306a36Sopenharmony_ci	new_pos = roundup_64(new_pos, array->obj_size);
32762306a36Sopenharmony_ci	*cur = xfarray_idx(array, new_pos);
32862306a36Sopenharmony_ci	*pos = xfarray_pos(array, *cur);
32962306a36Sopenharmony_ci	return 0;
33062306a36Sopenharmony_ci}
33162306a36Sopenharmony_ci
33262306a36Sopenharmony_ci/*
33362306a36Sopenharmony_ci * Starting at *idx, fetch the next non-null array entry and advance the index
33462306a36Sopenharmony_ci * to set up the next _load_next call.  Returns ENODATA if we reach the end of
33562306a36Sopenharmony_ci * the array.  Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
33662306a36Sopenharmony_ci * call to this function.
33762306a36Sopenharmony_ci */
33862306a36Sopenharmony_ciint
33962306a36Sopenharmony_cixfarray_load_next(
34062306a36Sopenharmony_ci	struct xfarray	*array,
34162306a36Sopenharmony_ci	xfarray_idx_t	*idx,
34262306a36Sopenharmony_ci	void		*rec)
34362306a36Sopenharmony_ci{
34462306a36Sopenharmony_ci	xfarray_idx_t	cur = *idx;
34562306a36Sopenharmony_ci	loff_t		pos = xfarray_pos(array, cur);
34662306a36Sopenharmony_ci	int		error;
34762306a36Sopenharmony_ci
34862306a36Sopenharmony_ci	do {
34962306a36Sopenharmony_ci		if (cur >= array->nr)
35062306a36Sopenharmony_ci			return -ENODATA;
35162306a36Sopenharmony_ci
35262306a36Sopenharmony_ci		/*
35362306a36Sopenharmony_ci		 * Ask the backing store for the location of next possible
35462306a36Sopenharmony_ci		 * written record, then retrieve that record.
35562306a36Sopenharmony_ci		 */
35662306a36Sopenharmony_ci		error = xfarray_find_data(array, &cur, &pos);
35762306a36Sopenharmony_ci		if (error)
35862306a36Sopenharmony_ci			return error;
35962306a36Sopenharmony_ci		error = xfarray_load(array, cur, rec);
36062306a36Sopenharmony_ci		if (error)
36162306a36Sopenharmony_ci			return error;
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ci		cur++;
36462306a36Sopenharmony_ci		pos += array->obj_size;
36562306a36Sopenharmony_ci	} while (xfarray_element_is_null(array, rec));
36662306a36Sopenharmony_ci
36762306a36Sopenharmony_ci	*idx = cur;
36862306a36Sopenharmony_ci	return 0;
36962306a36Sopenharmony_ci}
37062306a36Sopenharmony_ci
37162306a36Sopenharmony_ci/* Sorting functions */
37262306a36Sopenharmony_ci
37362306a36Sopenharmony_ci#ifdef DEBUG
37462306a36Sopenharmony_ci# define xfarray_sort_bump_loads(si)	do { (si)->loads++; } while (0)
37562306a36Sopenharmony_ci# define xfarray_sort_bump_stores(si)	do { (si)->stores++; } while (0)
37662306a36Sopenharmony_ci# define xfarray_sort_bump_compares(si)	do { (si)->compares++; } while (0)
37762306a36Sopenharmony_ci# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
37862306a36Sopenharmony_ci#else
37962306a36Sopenharmony_ci# define xfarray_sort_bump_loads(si)
38062306a36Sopenharmony_ci# define xfarray_sort_bump_stores(si)
38162306a36Sopenharmony_ci# define xfarray_sort_bump_compares(si)
38262306a36Sopenharmony_ci# define xfarray_sort_bump_heapsorts(si)
38362306a36Sopenharmony_ci#endif /* DEBUG */
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci/* Load an array element for sorting. */
38662306a36Sopenharmony_cistatic inline int
38762306a36Sopenharmony_cixfarray_sort_load(
38862306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
38962306a36Sopenharmony_ci	xfarray_idx_t		idx,
39062306a36Sopenharmony_ci	void			*ptr)
39162306a36Sopenharmony_ci{
39262306a36Sopenharmony_ci	xfarray_sort_bump_loads(si);
39362306a36Sopenharmony_ci	return xfarray_load(si->array, idx, ptr);
39462306a36Sopenharmony_ci}
39562306a36Sopenharmony_ci
39662306a36Sopenharmony_ci/* Store an array element for sorting. */
39762306a36Sopenharmony_cistatic inline int
39862306a36Sopenharmony_cixfarray_sort_store(
39962306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
40062306a36Sopenharmony_ci	xfarray_idx_t		idx,
40162306a36Sopenharmony_ci	void			*ptr)
40262306a36Sopenharmony_ci{
40362306a36Sopenharmony_ci	xfarray_sort_bump_stores(si);
40462306a36Sopenharmony_ci	return xfarray_store(si->array, idx, ptr);
40562306a36Sopenharmony_ci}
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_ci/* Compare an array element for sorting. */
40862306a36Sopenharmony_cistatic inline int
40962306a36Sopenharmony_cixfarray_sort_cmp(
41062306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
41162306a36Sopenharmony_ci	const void		*a,
41262306a36Sopenharmony_ci	const void		*b)
41362306a36Sopenharmony_ci{
41462306a36Sopenharmony_ci	xfarray_sort_bump_compares(si);
41562306a36Sopenharmony_ci	return si->cmp_fn(a, b);
41662306a36Sopenharmony_ci}
41762306a36Sopenharmony_ci
41862306a36Sopenharmony_ci/* Return a pointer to the low index stack for quicksort partitioning. */
41962306a36Sopenharmony_cistatic inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
42062306a36Sopenharmony_ci{
42162306a36Sopenharmony_ci	return (xfarray_idx_t *)(si + 1);
42262306a36Sopenharmony_ci}
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci/* Return a pointer to the high index stack for quicksort partitioning. */
42562306a36Sopenharmony_cistatic inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
42662306a36Sopenharmony_ci{
42762306a36Sopenharmony_ci	return xfarray_sortinfo_lo(si) + si->max_stack_depth;
42862306a36Sopenharmony_ci}
42962306a36Sopenharmony_ci
43062306a36Sopenharmony_ci/* Size of each element in the quicksort pivot array. */
43162306a36Sopenharmony_cistatic inline size_t
43262306a36Sopenharmony_cixfarray_pivot_rec_sz(
43362306a36Sopenharmony_ci	struct xfarray		*array)
43462306a36Sopenharmony_ci{
43562306a36Sopenharmony_ci	return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
43662306a36Sopenharmony_ci}
43762306a36Sopenharmony_ci
43862306a36Sopenharmony_ci/* Allocate memory to handle the sort. */
43962306a36Sopenharmony_cistatic inline int
44062306a36Sopenharmony_cixfarray_sortinfo_alloc(
44162306a36Sopenharmony_ci	struct xfarray		*array,
44262306a36Sopenharmony_ci	xfarray_cmp_fn		cmp_fn,
44362306a36Sopenharmony_ci	unsigned int		flags,
44462306a36Sopenharmony_ci	struct xfarray_sortinfo	**infop)
44562306a36Sopenharmony_ci{
44662306a36Sopenharmony_ci	struct xfarray_sortinfo	*si;
44762306a36Sopenharmony_ci	size_t			nr_bytes = sizeof(struct xfarray_sortinfo);
44862306a36Sopenharmony_ci	size_t			pivot_rec_sz = xfarray_pivot_rec_sz(array);
44962306a36Sopenharmony_ci	int			max_stack_depth;
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci	/*
45262306a36Sopenharmony_ci	 * The median-of-nine pivot algorithm doesn't work if a subset has
45362306a36Sopenharmony_ci	 * fewer than 9 items.  Make sure the in-memory sort will always take
45462306a36Sopenharmony_ci	 * over for subsets where this wouldn't be the case.
45562306a36Sopenharmony_ci	 */
45662306a36Sopenharmony_ci	BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);
45762306a36Sopenharmony_ci
45862306a36Sopenharmony_ci	/*
45962306a36Sopenharmony_ci	 * Tail-call recursion during the partitioning phase means that
46062306a36Sopenharmony_ci	 * quicksort will never recurse more than log2(nr) times.  We need one
46162306a36Sopenharmony_ci	 * extra level of stack to hold the initial parameters.  In-memory
46262306a36Sopenharmony_ci	 * sort will always take care of the last few levels of recursion for
46362306a36Sopenharmony_ci	 * us, so we can reduce the stack depth by that much.
46462306a36Sopenharmony_ci	 */
46562306a36Sopenharmony_ci	max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
46662306a36Sopenharmony_ci	if (max_stack_depth < 1)
46762306a36Sopenharmony_ci		max_stack_depth = 1;
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	/* Each level of quicksort uses a lo and a hi index */
47062306a36Sopenharmony_ci	nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
47162306a36Sopenharmony_ci
47262306a36Sopenharmony_ci	/* Scratchpad for in-memory sort, or finding the pivot */
47362306a36Sopenharmony_ci	nr_bytes += max_t(size_t,
47462306a36Sopenharmony_ci			(XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
47562306a36Sopenharmony_ci			XFARRAY_ISORT_NR * array->obj_size);
47662306a36Sopenharmony_ci
47762306a36Sopenharmony_ci	si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
47862306a36Sopenharmony_ci	if (!si)
47962306a36Sopenharmony_ci		return -ENOMEM;
48062306a36Sopenharmony_ci
48162306a36Sopenharmony_ci	si->array = array;
48262306a36Sopenharmony_ci	si->cmp_fn = cmp_fn;
48362306a36Sopenharmony_ci	si->flags = flags;
48462306a36Sopenharmony_ci	si->max_stack_depth = max_stack_depth;
48562306a36Sopenharmony_ci	si->max_stack_used = 1;
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_ci	xfarray_sortinfo_lo(si)[0] = 0;
48862306a36Sopenharmony_ci	xfarray_sortinfo_hi(si)[0] = array->nr - 1;
48962306a36Sopenharmony_ci
49062306a36Sopenharmony_ci	trace_xfarray_sort(si, nr_bytes);
49162306a36Sopenharmony_ci	*infop = si;
49262306a36Sopenharmony_ci	return 0;
49362306a36Sopenharmony_ci}
49462306a36Sopenharmony_ci
49562306a36Sopenharmony_ci/* Should this sort be terminated by a fatal signal? */
49662306a36Sopenharmony_cistatic inline bool
49762306a36Sopenharmony_cixfarray_sort_terminated(
49862306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
49962306a36Sopenharmony_ci	int			*error)
50062306a36Sopenharmony_ci{
50162306a36Sopenharmony_ci	/*
50262306a36Sopenharmony_ci	 * If preemption is disabled, we need to yield to the scheduler every
50362306a36Sopenharmony_ci	 * few seconds so that we don't run afoul of the soft lockup watchdog
50462306a36Sopenharmony_ci	 * or RCU stall detector.
50562306a36Sopenharmony_ci	 */
50662306a36Sopenharmony_ci	cond_resched();
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci	if ((si->flags & XFARRAY_SORT_KILLABLE) &&
50962306a36Sopenharmony_ci	    fatal_signal_pending(current)) {
51062306a36Sopenharmony_ci		if (*error == 0)
51162306a36Sopenharmony_ci			*error = -EINTR;
51262306a36Sopenharmony_ci		return true;
51362306a36Sopenharmony_ci	}
51462306a36Sopenharmony_ci	return false;
51562306a36Sopenharmony_ci}
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci/* Do we want an in-memory sort? */
51862306a36Sopenharmony_cistatic inline bool
51962306a36Sopenharmony_cixfarray_want_isort(
52062306a36Sopenharmony_ci	struct xfarray_sortinfo *si,
52162306a36Sopenharmony_ci	xfarray_idx_t		start,
52262306a36Sopenharmony_ci	xfarray_idx_t		end)
52362306a36Sopenharmony_ci{
52462306a36Sopenharmony_ci	/*
52562306a36Sopenharmony_ci	 * For array subsets that fit in the scratchpad, it's much faster to
52662306a36Sopenharmony_ci	 * use the kernel's heapsort than quicksort's stack machine.
52762306a36Sopenharmony_ci	 */
52862306a36Sopenharmony_ci	return (end - start) < XFARRAY_ISORT_NR;
52962306a36Sopenharmony_ci}
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_ci/* Return the scratch space within the sortinfo structure. */
53262306a36Sopenharmony_cistatic inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
53362306a36Sopenharmony_ci{
53462306a36Sopenharmony_ci	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
53562306a36Sopenharmony_ci}
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci/*
53862306a36Sopenharmony_ci * Sort a small number of array records using scratchpad memory.  The records
53962306a36Sopenharmony_ci * need not be contiguous in the xfile's memory pages.
54062306a36Sopenharmony_ci */
54162306a36Sopenharmony_ciSTATIC int
54262306a36Sopenharmony_cixfarray_isort(
54362306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
54462306a36Sopenharmony_ci	xfarray_idx_t		lo,
54562306a36Sopenharmony_ci	xfarray_idx_t		hi)
54662306a36Sopenharmony_ci{
54762306a36Sopenharmony_ci	void			*scratch = xfarray_sortinfo_isort_scratch(si);
54862306a36Sopenharmony_ci	loff_t			lo_pos = xfarray_pos(si->array, lo);
54962306a36Sopenharmony_ci	loff_t			len = xfarray_pos(si->array, hi - lo + 1);
55062306a36Sopenharmony_ci	int			error;
55162306a36Sopenharmony_ci
55262306a36Sopenharmony_ci	trace_xfarray_isort(si, lo, hi);
55362306a36Sopenharmony_ci
55462306a36Sopenharmony_ci	xfarray_sort_bump_loads(si);
55562306a36Sopenharmony_ci	error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
55662306a36Sopenharmony_ci	if (error)
55762306a36Sopenharmony_ci		return error;
55862306a36Sopenharmony_ci
55962306a36Sopenharmony_ci	xfarray_sort_bump_heapsorts(si);
56062306a36Sopenharmony_ci	sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
56162306a36Sopenharmony_ci
56262306a36Sopenharmony_ci	xfarray_sort_bump_stores(si);
56362306a36Sopenharmony_ci	return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
56462306a36Sopenharmony_ci}
56562306a36Sopenharmony_ci
56662306a36Sopenharmony_ci/* Grab a page for sorting records. */
56762306a36Sopenharmony_cistatic inline int
56862306a36Sopenharmony_cixfarray_sort_get_page(
56962306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
57062306a36Sopenharmony_ci	loff_t			pos,
57162306a36Sopenharmony_ci	uint64_t		len)
57262306a36Sopenharmony_ci{
57362306a36Sopenharmony_ci	int			error;
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci	error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
57662306a36Sopenharmony_ci	if (error)
57762306a36Sopenharmony_ci		return error;
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci	/*
58062306a36Sopenharmony_ci	 * xfile pages must never be mapped into userspace, so we skip the
58162306a36Sopenharmony_ci	 * dcache flush when mapping the page.
58262306a36Sopenharmony_ci	 */
58362306a36Sopenharmony_ci	si->page_kaddr = kmap_local_page(si->xfpage.page);
58462306a36Sopenharmony_ci	return 0;
58562306a36Sopenharmony_ci}
58662306a36Sopenharmony_ci
58762306a36Sopenharmony_ci/* Release a page we grabbed for sorting records. */
58862306a36Sopenharmony_cistatic inline int
58962306a36Sopenharmony_cixfarray_sort_put_page(
59062306a36Sopenharmony_ci	struct xfarray_sortinfo	*si)
59162306a36Sopenharmony_ci{
59262306a36Sopenharmony_ci	if (!si->page_kaddr)
59362306a36Sopenharmony_ci		return 0;
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ci	kunmap_local(si->page_kaddr);
59662306a36Sopenharmony_ci	si->page_kaddr = NULL;
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci	return xfile_put_page(si->array->xfile, &si->xfpage);
59962306a36Sopenharmony_ci}
60062306a36Sopenharmony_ci
60162306a36Sopenharmony_ci/* Decide if these records are eligible for in-page sorting. */
60262306a36Sopenharmony_cistatic inline bool
60362306a36Sopenharmony_cixfarray_want_pagesort(
60462306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
60562306a36Sopenharmony_ci	xfarray_idx_t		lo,
60662306a36Sopenharmony_ci	xfarray_idx_t		hi)
60762306a36Sopenharmony_ci{
60862306a36Sopenharmony_ci	pgoff_t			lo_page;
60962306a36Sopenharmony_ci	pgoff_t			hi_page;
61062306a36Sopenharmony_ci	loff_t			end_pos;
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci	/* We can only map one page at a time. */
61362306a36Sopenharmony_ci	lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
61462306a36Sopenharmony_ci	end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
61562306a36Sopenharmony_ci	hi_page = end_pos >> PAGE_SHIFT;
61662306a36Sopenharmony_ci
61762306a36Sopenharmony_ci	return lo_page == hi_page;
61862306a36Sopenharmony_ci}
61962306a36Sopenharmony_ci
62062306a36Sopenharmony_ci/* Sort a bunch of records that all live in the same memory page. */
62162306a36Sopenharmony_ciSTATIC int
62262306a36Sopenharmony_cixfarray_pagesort(
62362306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
62462306a36Sopenharmony_ci	xfarray_idx_t		lo,
62562306a36Sopenharmony_ci	xfarray_idx_t		hi)
62662306a36Sopenharmony_ci{
62762306a36Sopenharmony_ci	void			*startp;
62862306a36Sopenharmony_ci	loff_t			lo_pos = xfarray_pos(si->array, lo);
62962306a36Sopenharmony_ci	uint64_t		len = xfarray_pos(si->array, hi - lo);
63062306a36Sopenharmony_ci	int			error = 0;
63162306a36Sopenharmony_ci
63262306a36Sopenharmony_ci	trace_xfarray_pagesort(si, lo, hi);
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	xfarray_sort_bump_loads(si);
63562306a36Sopenharmony_ci	error = xfarray_sort_get_page(si, lo_pos, len);
63662306a36Sopenharmony_ci	if (error)
63762306a36Sopenharmony_ci		return error;
63862306a36Sopenharmony_ci
63962306a36Sopenharmony_ci	xfarray_sort_bump_heapsorts(si);
64062306a36Sopenharmony_ci	startp = si->page_kaddr + offset_in_page(lo_pos);
64162306a36Sopenharmony_ci	sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
64262306a36Sopenharmony_ci
64362306a36Sopenharmony_ci	xfarray_sort_bump_stores(si);
64462306a36Sopenharmony_ci	return xfarray_sort_put_page(si);
64562306a36Sopenharmony_ci}
64662306a36Sopenharmony_ci
64762306a36Sopenharmony_ci/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
64862306a36Sopenharmony_cistatic inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
64962306a36Sopenharmony_ci{
65062306a36Sopenharmony_ci	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
65162306a36Sopenharmony_ci}
65262306a36Sopenharmony_ci
65362306a36Sopenharmony_ci/* Return a pointer to the start of the pivot array. */
65462306a36Sopenharmony_cistatic inline void *
65562306a36Sopenharmony_cixfarray_sortinfo_pivot_array(
65662306a36Sopenharmony_ci	struct xfarray_sortinfo	*si)
65762306a36Sopenharmony_ci{
65862306a36Sopenharmony_ci	return xfarray_sortinfo_pivot(si) + si->array->obj_size;
65962306a36Sopenharmony_ci}
66062306a36Sopenharmony_ci
66162306a36Sopenharmony_ci/* The xfarray record is stored at the start of each pivot array element. */
66262306a36Sopenharmony_cistatic inline void *
66362306a36Sopenharmony_cixfarray_pivot_array_rec(
66462306a36Sopenharmony_ci	void			*pa,
66562306a36Sopenharmony_ci	size_t			pa_recsz,
66662306a36Sopenharmony_ci	unsigned int		pa_idx)
66762306a36Sopenharmony_ci{
66862306a36Sopenharmony_ci	return pa + (pa_recsz * pa_idx);
66962306a36Sopenharmony_ci}
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci/* The xfarray index is stored at the end of each pivot array element. */
67262306a36Sopenharmony_cistatic inline xfarray_idx_t *
67362306a36Sopenharmony_cixfarray_pivot_array_idx(
67462306a36Sopenharmony_ci	void			*pa,
67562306a36Sopenharmony_ci	size_t			pa_recsz,
67662306a36Sopenharmony_ci	unsigned int		pa_idx)
67762306a36Sopenharmony_ci{
67862306a36Sopenharmony_ci	return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
67962306a36Sopenharmony_ci			sizeof(xfarray_idx_t);
68062306a36Sopenharmony_ci}
68162306a36Sopenharmony_ci
68262306a36Sopenharmony_ci/*
68362306a36Sopenharmony_ci * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
68462306a36Sopenharmony_ci * the cached pivot record for the next step.
68562306a36Sopenharmony_ci *
68662306a36Sopenharmony_ci * Load evenly-spaced records within the given range into memory, sort them,
68762306a36Sopenharmony_ci * and choose the pivot from the median record.  Using multiple points will
68862306a36Sopenharmony_ci * improve the quality of the pivot selection, and hopefully avoid the worst
68962306a36Sopenharmony_ci * quicksort behavior, since our array values are nearly always evenly sorted.
69062306a36Sopenharmony_ci */
69162306a36Sopenharmony_ciSTATIC int
69262306a36Sopenharmony_cixfarray_qsort_pivot(
69362306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
69462306a36Sopenharmony_ci	xfarray_idx_t		lo,
69562306a36Sopenharmony_ci	xfarray_idx_t		hi)
69662306a36Sopenharmony_ci{
69762306a36Sopenharmony_ci	void			*pivot = xfarray_sortinfo_pivot(si);
69862306a36Sopenharmony_ci	void			*parray = xfarray_sortinfo_pivot_array(si);
69962306a36Sopenharmony_ci	void			*recp;
70062306a36Sopenharmony_ci	xfarray_idx_t		*idxp;
70162306a36Sopenharmony_ci	xfarray_idx_t		step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
70262306a36Sopenharmony_ci	size_t			pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
70362306a36Sopenharmony_ci	int			i, j;
70462306a36Sopenharmony_ci	int			error;
70562306a36Sopenharmony_ci
70662306a36Sopenharmony_ci	ASSERT(step > 0);
70762306a36Sopenharmony_ci
70862306a36Sopenharmony_ci	/*
70962306a36Sopenharmony_ci	 * Load the xfarray indexes of the records we intend to sample into the
71062306a36Sopenharmony_ci	 * pivot array.
71162306a36Sopenharmony_ci	 */
71262306a36Sopenharmony_ci	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
71362306a36Sopenharmony_ci	*idxp = lo;
71462306a36Sopenharmony_ci	for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
71562306a36Sopenharmony_ci		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
71662306a36Sopenharmony_ci		*idxp = lo + (i * step);
71762306a36Sopenharmony_ci	}
71862306a36Sopenharmony_ci	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
71962306a36Sopenharmony_ci			XFARRAY_QSORT_PIVOT_NR - 1);
72062306a36Sopenharmony_ci	*idxp = hi;
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	/* Load the selected xfarray records into the pivot array. */
72362306a36Sopenharmony_ci	for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
72462306a36Sopenharmony_ci		xfarray_idx_t	idx;
72562306a36Sopenharmony_ci
72662306a36Sopenharmony_ci		recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
72762306a36Sopenharmony_ci		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
72862306a36Sopenharmony_ci
72962306a36Sopenharmony_ci		/* No unset records; load directly into the array. */
73062306a36Sopenharmony_ci		if (likely(si->array->unset_slots == 0)) {
73162306a36Sopenharmony_ci			error = xfarray_sort_load(si, *idxp, recp);
73262306a36Sopenharmony_ci			if (error)
73362306a36Sopenharmony_ci				return error;
73462306a36Sopenharmony_ci			continue;
73562306a36Sopenharmony_ci		}
73662306a36Sopenharmony_ci
73762306a36Sopenharmony_ci		/*
73862306a36Sopenharmony_ci		 * Load non-null records into the scratchpad without changing
73962306a36Sopenharmony_ci		 * the xfarray_idx_t in the pivot array.
74062306a36Sopenharmony_ci		 */
74162306a36Sopenharmony_ci		idx = *idxp;
74262306a36Sopenharmony_ci		xfarray_sort_bump_loads(si);
74362306a36Sopenharmony_ci		error = xfarray_load_next(si->array, &idx, recp);
74462306a36Sopenharmony_ci		if (error)
74562306a36Sopenharmony_ci			return error;
74662306a36Sopenharmony_ci	}
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci	xfarray_sort_bump_heapsorts(si);
74962306a36Sopenharmony_ci	sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci	/*
75262306a36Sopenharmony_ci	 * We sorted the pivot array records (which includes the xfarray
75362306a36Sopenharmony_ci	 * indices) in xfarray record order.  The median element of the pivot
75462306a36Sopenharmony_ci	 * array contains the xfarray record that we will use as the pivot.
75562306a36Sopenharmony_ci	 * Copy that xfarray record to the designated space.
75662306a36Sopenharmony_ci	 */
75762306a36Sopenharmony_ci	recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
75862306a36Sopenharmony_ci			XFARRAY_QSORT_PIVOT_NR / 2);
75962306a36Sopenharmony_ci	memcpy(pivot, recp, si->array->obj_size);
76062306a36Sopenharmony_ci
76162306a36Sopenharmony_ci	/* If the pivot record we chose was already in a[lo] then we're done. */
76262306a36Sopenharmony_ci	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
76362306a36Sopenharmony_ci			XFARRAY_QSORT_PIVOT_NR / 2);
76462306a36Sopenharmony_ci	if (*idxp == lo)
76562306a36Sopenharmony_ci		return 0;
76662306a36Sopenharmony_ci
76762306a36Sopenharmony_ci	/*
76862306a36Sopenharmony_ci	 * Find the cached copy of a[lo] in the pivot array so that we can swap
76962306a36Sopenharmony_ci	 * a[lo] and a[pivot].
77062306a36Sopenharmony_ci	 */
77162306a36Sopenharmony_ci	for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
77262306a36Sopenharmony_ci		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
77362306a36Sopenharmony_ci		if (*idxp == lo)
77462306a36Sopenharmony_ci			j = i;
77562306a36Sopenharmony_ci	}
77662306a36Sopenharmony_ci	if (j < 0) {
77762306a36Sopenharmony_ci		ASSERT(j >= 0);
77862306a36Sopenharmony_ci		return -EFSCORRUPTED;
77962306a36Sopenharmony_ci	}
78062306a36Sopenharmony_ci
78162306a36Sopenharmony_ci	/* Swap a[lo] and a[pivot]. */
78262306a36Sopenharmony_ci	error = xfarray_sort_store(si, lo, pivot);
78362306a36Sopenharmony_ci	if (error)
78462306a36Sopenharmony_ci		return error;
78562306a36Sopenharmony_ci
78662306a36Sopenharmony_ci	recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
78762306a36Sopenharmony_ci	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
78862306a36Sopenharmony_ci			XFARRAY_QSORT_PIVOT_NR / 2);
78962306a36Sopenharmony_ci	return xfarray_sort_store(si, *idxp, recp);
79062306a36Sopenharmony_ci}
79162306a36Sopenharmony_ci
79262306a36Sopenharmony_ci/*
79362306a36Sopenharmony_ci * Set up the pointers for the next iteration.  We push onto the stack all of
79462306a36Sopenharmony_ci * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
79562306a36Sopenharmony_ci * current stack frame to point to the unsorted values between a[beg[i]] and
79662306a36Sopenharmony_ci * a[lo] so that those values will be sorted when we pop the stack.
79762306a36Sopenharmony_ci */
79862306a36Sopenharmony_cistatic inline int
79962306a36Sopenharmony_cixfarray_qsort_push(
80062306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
80162306a36Sopenharmony_ci	xfarray_idx_t		*si_lo,
80262306a36Sopenharmony_ci	xfarray_idx_t		*si_hi,
80362306a36Sopenharmony_ci	xfarray_idx_t		lo,
80462306a36Sopenharmony_ci	xfarray_idx_t		hi)
80562306a36Sopenharmony_ci{
80662306a36Sopenharmony_ci	/* Check for stack overflows */
80762306a36Sopenharmony_ci	if (si->stack_depth >= si->max_stack_depth - 1) {
80862306a36Sopenharmony_ci		ASSERT(si->stack_depth < si->max_stack_depth - 1);
80962306a36Sopenharmony_ci		return -EFSCORRUPTED;
81062306a36Sopenharmony_ci	}
81162306a36Sopenharmony_ci
81262306a36Sopenharmony_ci	si->max_stack_used = max_t(uint8_t, si->max_stack_used,
81362306a36Sopenharmony_ci					    si->stack_depth + 2);
81462306a36Sopenharmony_ci
81562306a36Sopenharmony_ci	si_lo[si->stack_depth + 1] = lo + 1;
81662306a36Sopenharmony_ci	si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
81762306a36Sopenharmony_ci	si_hi[si->stack_depth++] = lo - 1;
81862306a36Sopenharmony_ci
81962306a36Sopenharmony_ci	/*
82062306a36Sopenharmony_ci	 * Always start with the smaller of the two partitions to keep the
82162306a36Sopenharmony_ci	 * amount of recursion in check.
82262306a36Sopenharmony_ci	 */
82362306a36Sopenharmony_ci	if (si_hi[si->stack_depth]     - si_lo[si->stack_depth] >
82462306a36Sopenharmony_ci	    si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
82562306a36Sopenharmony_ci		swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
82662306a36Sopenharmony_ci		swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
82762306a36Sopenharmony_ci	}
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_ci	return 0;
83062306a36Sopenharmony_ci}
83162306a36Sopenharmony_ci
83262306a36Sopenharmony_ci/*
83362306a36Sopenharmony_ci * Load an element from the array into the first scratchpad and cache the page,
83462306a36Sopenharmony_ci * if possible.
83562306a36Sopenharmony_ci */
83662306a36Sopenharmony_cistatic inline int
83762306a36Sopenharmony_cixfarray_sort_load_cached(
83862306a36Sopenharmony_ci	struct xfarray_sortinfo	*si,
83962306a36Sopenharmony_ci	xfarray_idx_t		idx,
84062306a36Sopenharmony_ci	void			*ptr)
84162306a36Sopenharmony_ci{
84262306a36Sopenharmony_ci	loff_t			idx_pos = xfarray_pos(si->array, idx);
84362306a36Sopenharmony_ci	pgoff_t			startpage;
84462306a36Sopenharmony_ci	pgoff_t			endpage;
84562306a36Sopenharmony_ci	int			error = 0;
84662306a36Sopenharmony_ci
84762306a36Sopenharmony_ci	/*
84862306a36Sopenharmony_ci	 * If this load would split a page, release the cached page, if any,
84962306a36Sopenharmony_ci	 * and perform a traditional read.
85062306a36Sopenharmony_ci	 */
85162306a36Sopenharmony_ci	startpage = idx_pos >> PAGE_SHIFT;
85262306a36Sopenharmony_ci	endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
85362306a36Sopenharmony_ci	if (startpage != endpage) {
85462306a36Sopenharmony_ci		error = xfarray_sort_put_page(si);
85562306a36Sopenharmony_ci		if (error)
85662306a36Sopenharmony_ci			return error;
85762306a36Sopenharmony_ci
85862306a36Sopenharmony_ci		if (xfarray_sort_terminated(si, &error))
85962306a36Sopenharmony_ci			return error;
86062306a36Sopenharmony_ci
86162306a36Sopenharmony_ci		return xfile_obj_load(si->array->xfile, ptr,
86262306a36Sopenharmony_ci				si->array->obj_size, idx_pos);
86362306a36Sopenharmony_ci	}
86462306a36Sopenharmony_ci
86562306a36Sopenharmony_ci	/* If the cached page is not the one we want, release it. */
86662306a36Sopenharmony_ci	if (xfile_page_cached(&si->xfpage) &&
86762306a36Sopenharmony_ci	    xfile_page_index(&si->xfpage) != startpage) {
86862306a36Sopenharmony_ci		error = xfarray_sort_put_page(si);
86962306a36Sopenharmony_ci		if (error)
87062306a36Sopenharmony_ci			return error;
87162306a36Sopenharmony_ci	}
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci	/*
87462306a36Sopenharmony_ci	 * If we don't have a cached page (and we know the load is contained
87562306a36Sopenharmony_ci	 * in a single page) then grab it.
87662306a36Sopenharmony_ci	 */
87762306a36Sopenharmony_ci	if (!xfile_page_cached(&si->xfpage)) {
87862306a36Sopenharmony_ci		if (xfarray_sort_terminated(si, &error))
87962306a36Sopenharmony_ci			return error;
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ci		error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
88262306a36Sopenharmony_ci				PAGE_SIZE);
88362306a36Sopenharmony_ci		if (error)
88462306a36Sopenharmony_ci			return error;
88562306a36Sopenharmony_ci	}
88662306a36Sopenharmony_ci
88762306a36Sopenharmony_ci	memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
88862306a36Sopenharmony_ci			si->array->obj_size);
88962306a36Sopenharmony_ci	return 0;
89062306a36Sopenharmony_ci}
89162306a36Sopenharmony_ci
89262306a36Sopenharmony_ci/*
89362306a36Sopenharmony_ci * Sort the array elements via quicksort.  This implementation incorporates
89462306a36Sopenharmony_ci * four optimizations discussed in Sedgewick:
89562306a36Sopenharmony_ci *
89662306a36Sopenharmony_ci * 1. Use an explicit stack of array indices to store the next array partition
89762306a36Sopenharmony_ci *    to sort.  This helps us to avoid recursion in the call stack, which is
89862306a36Sopenharmony_ci *    particularly expensive in the kernel.
89962306a36Sopenharmony_ci *
90062306a36Sopenharmony_ci * 2. For arrays with records in arbitrary or user-controlled order, choose the
90162306a36Sopenharmony_ci *    pivot element using a median-of-nine decision tree.  This reduces the
90262306a36Sopenharmony_ci *    probability of selecting a bad pivot value which causes worst case
90362306a36Sopenharmony_ci *    behavior (i.e. partition sizes of 1).
90462306a36Sopenharmony_ci *
90562306a36Sopenharmony_ci * 3. The smaller of the two sub-partitions is pushed onto the stack to start
90662306a36Sopenharmony_ci *    the next level of recursion, and the larger sub-partition replaces the
90762306a36Sopenharmony_ci *    current stack frame.  This guarantees that we won't need more than
90862306a36Sopenharmony_ci *    log2(nr) stack space.
90962306a36Sopenharmony_ci *
91062306a36Sopenharmony_ci * 4. For small sets, load the records into the scratchpad and run heapsort on
91162306a36Sopenharmony_ci *    them because that is very fast.  In the author's experience, this yields
91262306a36Sopenharmony_ci *    a ~10% reduction in runtime.
91362306a36Sopenharmony_ci *
91462306a36Sopenharmony_ci *    If a small set is contained entirely within a single xfile memory page,
91562306a36Sopenharmony_ci *    map the page directly and run heap sort directly on the xfile page
91662306a36Sopenharmony_ci *    instead of using the load/store interface.  This halves the runtime.
91762306a36Sopenharmony_ci *
91862306a36Sopenharmony_ci * 5. This optimization is specific to the implementation.  When converging lo
91962306a36Sopenharmony_ci *    and hi after selecting a pivot, we will try to retain the xfile memory
92062306a36Sopenharmony_ci *    page between load calls, which reduces run time by 50%.
92162306a36Sopenharmony_ci */
92262306a36Sopenharmony_ci
92362306a36Sopenharmony_ci/*
92462306a36Sopenharmony_ci * Due to the use of signed indices, we can only support up to 2^63 records.
92562306a36Sopenharmony_ci * Files can only grow to 2^63 bytes, so this is not much of a limitation.
92662306a36Sopenharmony_ci */
92762306a36Sopenharmony_ci#define QSORT_MAX_RECS		(1ULL << 63)
92862306a36Sopenharmony_ci
92962306a36Sopenharmony_ciint
93062306a36Sopenharmony_cixfarray_sort(
93162306a36Sopenharmony_ci	struct xfarray		*array,
93262306a36Sopenharmony_ci	xfarray_cmp_fn		cmp_fn,
93362306a36Sopenharmony_ci	unsigned int		flags)
93462306a36Sopenharmony_ci{
93562306a36Sopenharmony_ci	struct xfarray_sortinfo	*si;
93662306a36Sopenharmony_ci	xfarray_idx_t		*si_lo, *si_hi;
93762306a36Sopenharmony_ci	void			*pivot;
93862306a36Sopenharmony_ci	void			*scratch = xfarray_scratch(array);
93962306a36Sopenharmony_ci	xfarray_idx_t		lo, hi;
94062306a36Sopenharmony_ci	int			error = 0;
94162306a36Sopenharmony_ci
94262306a36Sopenharmony_ci	if (array->nr < 2)
94362306a36Sopenharmony_ci		return 0;
94462306a36Sopenharmony_ci	if (array->nr >= QSORT_MAX_RECS)
94562306a36Sopenharmony_ci		return -E2BIG;
94662306a36Sopenharmony_ci
94762306a36Sopenharmony_ci	error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
94862306a36Sopenharmony_ci	if (error)
94962306a36Sopenharmony_ci		return error;
95062306a36Sopenharmony_ci	si_lo = xfarray_sortinfo_lo(si);
95162306a36Sopenharmony_ci	si_hi = xfarray_sortinfo_hi(si);
95262306a36Sopenharmony_ci	pivot = xfarray_sortinfo_pivot(si);
95362306a36Sopenharmony_ci
95462306a36Sopenharmony_ci	while (si->stack_depth >= 0) {
95562306a36Sopenharmony_ci		lo = si_lo[si->stack_depth];
95662306a36Sopenharmony_ci		hi = si_hi[si->stack_depth];
95762306a36Sopenharmony_ci
95862306a36Sopenharmony_ci		trace_xfarray_qsort(si, lo, hi);
95962306a36Sopenharmony_ci
96062306a36Sopenharmony_ci		/* Nothing left in this partition to sort; pop stack. */
96162306a36Sopenharmony_ci		if (lo >= hi) {
96262306a36Sopenharmony_ci			si->stack_depth--;
96362306a36Sopenharmony_ci			continue;
96462306a36Sopenharmony_ci		}
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci		/*
96762306a36Sopenharmony_ci		 * If directly mapping the page and sorting can solve our
96862306a36Sopenharmony_ci		 * problems, we're done.
96962306a36Sopenharmony_ci		 */
97062306a36Sopenharmony_ci		if (xfarray_want_pagesort(si, lo, hi)) {
97162306a36Sopenharmony_ci			error = xfarray_pagesort(si, lo, hi);
97262306a36Sopenharmony_ci			if (error)
97362306a36Sopenharmony_ci				goto out_free;
97462306a36Sopenharmony_ci			si->stack_depth--;
97562306a36Sopenharmony_ci			continue;
97662306a36Sopenharmony_ci		}
97762306a36Sopenharmony_ci
97862306a36Sopenharmony_ci		/* If insertion sort can solve our problems, we're done. */
97962306a36Sopenharmony_ci		if (xfarray_want_isort(si, lo, hi)) {
98062306a36Sopenharmony_ci			error = xfarray_isort(si, lo, hi);
98162306a36Sopenharmony_ci			if (error)
98262306a36Sopenharmony_ci				goto out_free;
98362306a36Sopenharmony_ci			si->stack_depth--;
98462306a36Sopenharmony_ci			continue;
98562306a36Sopenharmony_ci		}
98662306a36Sopenharmony_ci
98762306a36Sopenharmony_ci		/* Pick a pivot, move it to a[lo] and stash it. */
98862306a36Sopenharmony_ci		error = xfarray_qsort_pivot(si, lo, hi);
98962306a36Sopenharmony_ci		if (error)
99062306a36Sopenharmony_ci			goto out_free;
99162306a36Sopenharmony_ci
99262306a36Sopenharmony_ci		/*
99362306a36Sopenharmony_ci		 * Rearrange a[lo..hi] such that everything smaller than the
99462306a36Sopenharmony_ci		 * pivot is on the left side of the range and everything larger
99562306a36Sopenharmony_ci		 * than the pivot is on the right side of the range.
99662306a36Sopenharmony_ci		 */
99762306a36Sopenharmony_ci		while (lo < hi) {
99862306a36Sopenharmony_ci			/*
99962306a36Sopenharmony_ci			 * Decrement hi until it finds an a[hi] less than the
100062306a36Sopenharmony_ci			 * pivot value.
100162306a36Sopenharmony_ci			 */
100262306a36Sopenharmony_ci			error = xfarray_sort_load_cached(si, hi, scratch);
100362306a36Sopenharmony_ci			if (error)
100462306a36Sopenharmony_ci				goto out_free;
100562306a36Sopenharmony_ci			while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
100662306a36Sopenharmony_ci								lo < hi) {
100762306a36Sopenharmony_ci				hi--;
100862306a36Sopenharmony_ci				error = xfarray_sort_load_cached(si, hi,
100962306a36Sopenharmony_ci						scratch);
101062306a36Sopenharmony_ci				if (error)
101162306a36Sopenharmony_ci					goto out_free;
101262306a36Sopenharmony_ci			}
101362306a36Sopenharmony_ci			error = xfarray_sort_put_page(si);
101462306a36Sopenharmony_ci			if (error)
101562306a36Sopenharmony_ci				goto out_free;
101662306a36Sopenharmony_ci
101762306a36Sopenharmony_ci			if (xfarray_sort_terminated(si, &error))
101862306a36Sopenharmony_ci				goto out_free;
101962306a36Sopenharmony_ci
102062306a36Sopenharmony_ci			/* Copy that item (a[hi]) to a[lo]. */
102162306a36Sopenharmony_ci			if (lo < hi) {
102262306a36Sopenharmony_ci				error = xfarray_sort_store(si, lo++, scratch);
102362306a36Sopenharmony_ci				if (error)
102462306a36Sopenharmony_ci					goto out_free;
102562306a36Sopenharmony_ci			}
102662306a36Sopenharmony_ci
102762306a36Sopenharmony_ci			/*
102862306a36Sopenharmony_ci			 * Increment lo until it finds an a[lo] greater than
102962306a36Sopenharmony_ci			 * the pivot value.
103062306a36Sopenharmony_ci			 */
103162306a36Sopenharmony_ci			error = xfarray_sort_load_cached(si, lo, scratch);
103262306a36Sopenharmony_ci			if (error)
103362306a36Sopenharmony_ci				goto out_free;
103462306a36Sopenharmony_ci			while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
103562306a36Sopenharmony_ci								lo < hi) {
103662306a36Sopenharmony_ci				lo++;
103762306a36Sopenharmony_ci				error = xfarray_sort_load_cached(si, lo,
103862306a36Sopenharmony_ci						scratch);
103962306a36Sopenharmony_ci				if (error)
104062306a36Sopenharmony_ci					goto out_free;
104162306a36Sopenharmony_ci			}
104262306a36Sopenharmony_ci			error = xfarray_sort_put_page(si);
104362306a36Sopenharmony_ci			if (error)
104462306a36Sopenharmony_ci				goto out_free;
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci			if (xfarray_sort_terminated(si, &error))
104762306a36Sopenharmony_ci				goto out_free;
104862306a36Sopenharmony_ci
104962306a36Sopenharmony_ci			/* Copy that item (a[lo]) to a[hi]. */
105062306a36Sopenharmony_ci			if (lo < hi) {
105162306a36Sopenharmony_ci				error = xfarray_sort_store(si, hi--, scratch);
105262306a36Sopenharmony_ci				if (error)
105362306a36Sopenharmony_ci					goto out_free;
105462306a36Sopenharmony_ci			}
105562306a36Sopenharmony_ci
105662306a36Sopenharmony_ci			if (xfarray_sort_terminated(si, &error))
105762306a36Sopenharmony_ci				goto out_free;
105862306a36Sopenharmony_ci		}
105962306a36Sopenharmony_ci
106062306a36Sopenharmony_ci		/*
106162306a36Sopenharmony_ci		 * Put our pivot value in the correct place at a[lo].  All
106262306a36Sopenharmony_ci		 * values between a[beg[i]] and a[lo - 1] should be less than
106362306a36Sopenharmony_ci		 * the pivot; and all values between a[lo + 1] and a[end[i]-1]
106462306a36Sopenharmony_ci		 * should be greater than the pivot.
106562306a36Sopenharmony_ci		 */
106662306a36Sopenharmony_ci		error = xfarray_sort_store(si, lo, pivot);
106762306a36Sopenharmony_ci		if (error)
106862306a36Sopenharmony_ci			goto out_free;
106962306a36Sopenharmony_ci
107062306a36Sopenharmony_ci		/* Set up the stack frame to process the two partitions. */
107162306a36Sopenharmony_ci		error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
107262306a36Sopenharmony_ci		if (error)
107362306a36Sopenharmony_ci			goto out_free;
107462306a36Sopenharmony_ci
107562306a36Sopenharmony_ci		if (xfarray_sort_terminated(si, &error))
107662306a36Sopenharmony_ci			goto out_free;
107762306a36Sopenharmony_ci	}
107862306a36Sopenharmony_ci
107962306a36Sopenharmony_ciout_free:
108062306a36Sopenharmony_ci	trace_xfarray_sort_stats(si, error);
108162306a36Sopenharmony_ci	kvfree(si);
108262306a36Sopenharmony_ci	return error;
108362306a36Sopenharmony_ci}
1084