162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2021-2023 Oracle. All Rights Reserved. 462306a36Sopenharmony_ci * Author: Darrick J. Wong <djwong@kernel.org> 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci#include "xfs.h" 762306a36Sopenharmony_ci#include "xfs_fs.h" 862306a36Sopenharmony_ci#include "xfs_shared.h" 962306a36Sopenharmony_ci#include "xfs_format.h" 1062306a36Sopenharmony_ci#include "scrub/xfile.h" 1162306a36Sopenharmony_ci#include "scrub/xfarray.h" 1262306a36Sopenharmony_ci#include "scrub/scrub.h" 1362306a36Sopenharmony_ci#include "scrub/trace.h" 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_ci/* 1662306a36Sopenharmony_ci * Large Arrays of Fixed-Size Records 1762306a36Sopenharmony_ci * ================================== 1862306a36Sopenharmony_ci * 1962306a36Sopenharmony_ci * This memory array uses an xfile (which itself is a memfd "file") to store 2062306a36Sopenharmony_ci * large numbers of fixed-size records in memory that can be paged out. This 2162306a36Sopenharmony_ci * puts less stress on the memory reclaim algorithms during an online repair 2262306a36Sopenharmony_ci * because we don't have to pin so much memory. However, array access is less 2362306a36Sopenharmony_ci * direct than would be in a regular memory array. Access to the array is 2462306a36Sopenharmony_ci * performed via indexed load and store methods, and an append method is 2562306a36Sopenharmony_ci * provided for convenience. Array elements can be unset, which sets them to 2662306a36Sopenharmony_ci * all zeroes. Unset entries are skipped during iteration, though direct loads 2762306a36Sopenharmony_ci * will return a zeroed buffer. Callers are responsible for concurrency 2862306a36Sopenharmony_ci * control. 2962306a36Sopenharmony_ci */ 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci/* 3262306a36Sopenharmony_ci * Pointer to scratch space. Because we can't access the xfile data directly, 3362306a36Sopenharmony_ci * we allocate a small amount of memory on the end of the xfarray structure to 3462306a36Sopenharmony_ci * buffer array items when we need space to store values temporarily. 3562306a36Sopenharmony_ci */ 3662306a36Sopenharmony_cistatic inline void *xfarray_scratch(struct xfarray *array) 3762306a36Sopenharmony_ci{ 3862306a36Sopenharmony_ci return (array + 1); 3962306a36Sopenharmony_ci} 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_ci/* Compute array index given an xfile offset. */ 4262306a36Sopenharmony_cistatic xfarray_idx_t 4362306a36Sopenharmony_cixfarray_idx( 4462306a36Sopenharmony_ci struct xfarray *array, 4562306a36Sopenharmony_ci loff_t pos) 4662306a36Sopenharmony_ci{ 4762306a36Sopenharmony_ci if (array->obj_size_log >= 0) 4862306a36Sopenharmony_ci return (xfarray_idx_t)pos >> array->obj_size_log; 4962306a36Sopenharmony_ci 5062306a36Sopenharmony_ci return div_u64((xfarray_idx_t)pos, array->obj_size); 5162306a36Sopenharmony_ci} 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci/* Compute xfile offset of array element. */ 5462306a36Sopenharmony_cistatic inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx) 5562306a36Sopenharmony_ci{ 5662306a36Sopenharmony_ci if (array->obj_size_log >= 0) 5762306a36Sopenharmony_ci return idx << array->obj_size_log; 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_ci return idx * array->obj_size; 6062306a36Sopenharmony_ci} 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_ci/* 6362306a36Sopenharmony_ci * Initialize a big memory array. Array records cannot be larger than a 6462306a36Sopenharmony_ci * page, and the array cannot span more bytes than the page cache supports. 6562306a36Sopenharmony_ci * If @required_capacity is nonzero, the maximum array size will be set to this 6662306a36Sopenharmony_ci * quantity and the array creation will fail if the underlying storage cannot 6762306a36Sopenharmony_ci * support that many records. 6862306a36Sopenharmony_ci */ 6962306a36Sopenharmony_ciint 7062306a36Sopenharmony_cixfarray_create( 7162306a36Sopenharmony_ci const char *description, 7262306a36Sopenharmony_ci unsigned long long required_capacity, 7362306a36Sopenharmony_ci size_t obj_size, 7462306a36Sopenharmony_ci struct xfarray **arrayp) 7562306a36Sopenharmony_ci{ 7662306a36Sopenharmony_ci struct xfarray *array; 7762306a36Sopenharmony_ci struct xfile *xfile; 7862306a36Sopenharmony_ci int error; 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_ci ASSERT(obj_size < PAGE_SIZE); 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_ci error = xfile_create(description, 0, &xfile); 8362306a36Sopenharmony_ci if (error) 8462306a36Sopenharmony_ci return error; 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_ci error = -ENOMEM; 8762306a36Sopenharmony_ci array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS); 8862306a36Sopenharmony_ci if (!array) 8962306a36Sopenharmony_ci goto out_xfile; 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_ci array->xfile = xfile; 9262306a36Sopenharmony_ci array->obj_size = obj_size; 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci if (is_power_of_2(obj_size)) 9562306a36Sopenharmony_ci array->obj_size_log = ilog2(obj_size); 9662306a36Sopenharmony_ci else 9762306a36Sopenharmony_ci array->obj_size_log = -1; 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_ci array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE); 10062306a36Sopenharmony_ci trace_xfarray_create(array, required_capacity); 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci if (required_capacity > 0) { 10362306a36Sopenharmony_ci if (array->max_nr < required_capacity) { 10462306a36Sopenharmony_ci error = -ENOMEM; 10562306a36Sopenharmony_ci goto out_xfarray; 10662306a36Sopenharmony_ci } 10762306a36Sopenharmony_ci array->max_nr = required_capacity; 10862306a36Sopenharmony_ci } 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci *arrayp = array; 11162306a36Sopenharmony_ci return 0; 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_ciout_xfarray: 11462306a36Sopenharmony_ci kfree(array); 11562306a36Sopenharmony_ciout_xfile: 11662306a36Sopenharmony_ci xfile_destroy(xfile); 11762306a36Sopenharmony_ci return error; 11862306a36Sopenharmony_ci} 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_ci/* Destroy the array. */ 12162306a36Sopenharmony_civoid 12262306a36Sopenharmony_cixfarray_destroy( 12362306a36Sopenharmony_ci struct xfarray *array) 12462306a36Sopenharmony_ci{ 12562306a36Sopenharmony_ci xfile_destroy(array->xfile); 12662306a36Sopenharmony_ci kfree(array); 12762306a36Sopenharmony_ci} 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_ci/* Load an element from the array. */ 13062306a36Sopenharmony_ciint 13162306a36Sopenharmony_cixfarray_load( 13262306a36Sopenharmony_ci struct xfarray *array, 13362306a36Sopenharmony_ci xfarray_idx_t idx, 13462306a36Sopenharmony_ci void *ptr) 13562306a36Sopenharmony_ci{ 13662306a36Sopenharmony_ci if (idx >= array->nr) 13762306a36Sopenharmony_ci return -ENODATA; 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci return xfile_obj_load(array->xfile, ptr, array->obj_size, 14062306a36Sopenharmony_ci xfarray_pos(array, idx)); 14162306a36Sopenharmony_ci} 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci/* Is this array element potentially unset? */ 14462306a36Sopenharmony_cistatic inline bool 14562306a36Sopenharmony_cixfarray_is_unset( 14662306a36Sopenharmony_ci struct xfarray *array, 14762306a36Sopenharmony_ci loff_t pos) 14862306a36Sopenharmony_ci{ 14962306a36Sopenharmony_ci void *temp = xfarray_scratch(array); 15062306a36Sopenharmony_ci int error; 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci if (array->unset_slots == 0) 15362306a36Sopenharmony_ci return false; 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ci error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); 15662306a36Sopenharmony_ci if (!error && xfarray_element_is_null(array, temp)) 15762306a36Sopenharmony_ci return true; 15862306a36Sopenharmony_ci 15962306a36Sopenharmony_ci return false; 16062306a36Sopenharmony_ci} 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_ci/* 16362306a36Sopenharmony_ci * Unset an array element. If @idx is the last element in the array, the 16462306a36Sopenharmony_ci * array will be truncated. Otherwise, the entry will be zeroed. 16562306a36Sopenharmony_ci */ 16662306a36Sopenharmony_ciint 16762306a36Sopenharmony_cixfarray_unset( 16862306a36Sopenharmony_ci struct xfarray *array, 16962306a36Sopenharmony_ci xfarray_idx_t idx) 17062306a36Sopenharmony_ci{ 17162306a36Sopenharmony_ci void *temp = xfarray_scratch(array); 17262306a36Sopenharmony_ci loff_t pos = xfarray_pos(array, idx); 17362306a36Sopenharmony_ci int error; 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_ci if (idx >= array->nr) 17662306a36Sopenharmony_ci return -ENODATA; 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci if (idx == array->nr - 1) { 17962306a36Sopenharmony_ci array->nr--; 18062306a36Sopenharmony_ci return 0; 18162306a36Sopenharmony_ci } 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_ci if (xfarray_is_unset(array, pos)) 18462306a36Sopenharmony_ci return 0; 18562306a36Sopenharmony_ci 18662306a36Sopenharmony_ci memset(temp, 0, array->obj_size); 18762306a36Sopenharmony_ci error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); 18862306a36Sopenharmony_ci if (error) 18962306a36Sopenharmony_ci return error; 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_ci array->unset_slots++; 19262306a36Sopenharmony_ci return 0; 19362306a36Sopenharmony_ci} 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci/* 19662306a36Sopenharmony_ci * Store an element in the array. The element must not be completely zeroed, 19762306a36Sopenharmony_ci * because those are considered unset sparse elements. 19862306a36Sopenharmony_ci */ 19962306a36Sopenharmony_ciint 20062306a36Sopenharmony_cixfarray_store( 20162306a36Sopenharmony_ci struct xfarray *array, 20262306a36Sopenharmony_ci xfarray_idx_t idx, 20362306a36Sopenharmony_ci const void *ptr) 20462306a36Sopenharmony_ci{ 20562306a36Sopenharmony_ci int ret; 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci if (idx >= array->max_nr) 20862306a36Sopenharmony_ci return -EFBIG; 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci ASSERT(!xfarray_element_is_null(array, ptr)); 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci ret = xfile_obj_store(array->xfile, ptr, array->obj_size, 21362306a36Sopenharmony_ci xfarray_pos(array, idx)); 21462306a36Sopenharmony_ci if (ret) 21562306a36Sopenharmony_ci return ret; 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci array->nr = max(array->nr, idx + 1); 21862306a36Sopenharmony_ci return 0; 21962306a36Sopenharmony_ci} 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ci/* Is this array element NULL? */ 22262306a36Sopenharmony_cibool 22362306a36Sopenharmony_cixfarray_element_is_null( 22462306a36Sopenharmony_ci struct xfarray *array, 22562306a36Sopenharmony_ci const void *ptr) 22662306a36Sopenharmony_ci{ 22762306a36Sopenharmony_ci return !memchr_inv(ptr, 0, array->obj_size); 22862306a36Sopenharmony_ci} 22962306a36Sopenharmony_ci 23062306a36Sopenharmony_ci/* 23162306a36Sopenharmony_ci * Store an element anywhere in the array that is unset. If there are no 23262306a36Sopenharmony_ci * unset slots, append the element to the array. 23362306a36Sopenharmony_ci */ 23462306a36Sopenharmony_ciint 23562306a36Sopenharmony_cixfarray_store_anywhere( 23662306a36Sopenharmony_ci struct xfarray *array, 23762306a36Sopenharmony_ci const void *ptr) 23862306a36Sopenharmony_ci{ 23962306a36Sopenharmony_ci void *temp = xfarray_scratch(array); 24062306a36Sopenharmony_ci loff_t endpos = xfarray_pos(array, array->nr); 24162306a36Sopenharmony_ci loff_t pos; 24262306a36Sopenharmony_ci int error; 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci /* Find an unset slot to put it in. */ 24562306a36Sopenharmony_ci for (pos = 0; 24662306a36Sopenharmony_ci pos < endpos && array->unset_slots > 0; 24762306a36Sopenharmony_ci pos += array->obj_size) { 24862306a36Sopenharmony_ci error = xfile_obj_load(array->xfile, temp, array->obj_size, 24962306a36Sopenharmony_ci pos); 25062306a36Sopenharmony_ci if (error || !xfarray_element_is_null(array, temp)) 25162306a36Sopenharmony_ci continue; 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci error = xfile_obj_store(array->xfile, ptr, array->obj_size, 25462306a36Sopenharmony_ci pos); 25562306a36Sopenharmony_ci if (error) 25662306a36Sopenharmony_ci return error; 25762306a36Sopenharmony_ci 25862306a36Sopenharmony_ci array->unset_slots--; 25962306a36Sopenharmony_ci return 0; 26062306a36Sopenharmony_ci } 26162306a36Sopenharmony_ci 26262306a36Sopenharmony_ci /* No unset slots found; attach it on the end. */ 26362306a36Sopenharmony_ci array->unset_slots = 0; 26462306a36Sopenharmony_ci return xfarray_append(array, ptr); 26562306a36Sopenharmony_ci} 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci/* Return length of array. */ 26862306a36Sopenharmony_ciuint64_t 26962306a36Sopenharmony_cixfarray_length( 27062306a36Sopenharmony_ci struct xfarray *array) 27162306a36Sopenharmony_ci{ 27262306a36Sopenharmony_ci return array->nr; 27362306a36Sopenharmony_ci} 27462306a36Sopenharmony_ci 27562306a36Sopenharmony_ci/* 27662306a36Sopenharmony_ci * Decide which array item we're going to read as part of an _iter_get. 27762306a36Sopenharmony_ci * @cur is the array index, and @pos is the file offset of that array index in 27862306a36Sopenharmony_ci * the backing xfile. Returns ENODATA if we reach the end of the records. 27962306a36Sopenharmony_ci * 28062306a36Sopenharmony_ci * Reading from a hole in a sparse xfile causes page instantiation, so for 28162306a36Sopenharmony_ci * iterating a (possibly sparse) array we need to figure out if the cursor is 28262306a36Sopenharmony_ci * pointing at a totally uninitialized hole and move the cursor up if 28362306a36Sopenharmony_ci * necessary. 28462306a36Sopenharmony_ci */ 28562306a36Sopenharmony_cistatic inline int 28662306a36Sopenharmony_cixfarray_find_data( 28762306a36Sopenharmony_ci struct xfarray *array, 28862306a36Sopenharmony_ci xfarray_idx_t *cur, 28962306a36Sopenharmony_ci loff_t *pos) 29062306a36Sopenharmony_ci{ 29162306a36Sopenharmony_ci unsigned int pgoff = offset_in_page(*pos); 29262306a36Sopenharmony_ci loff_t end_pos = *pos + array->obj_size - 1; 29362306a36Sopenharmony_ci loff_t new_pos; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci /* 29662306a36Sopenharmony_ci * If the current array record is not adjacent to a page boundary, we 29762306a36Sopenharmony_ci * are in the middle of the page. We do not need to move the cursor. 29862306a36Sopenharmony_ci */ 29962306a36Sopenharmony_ci if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE) 30062306a36Sopenharmony_ci return 0; 30162306a36Sopenharmony_ci 30262306a36Sopenharmony_ci /* 30362306a36Sopenharmony_ci * Call SEEK_DATA on the last byte in the record we're about to read. 30462306a36Sopenharmony_ci * If the record ends at (or crosses) the end of a page then we know 30562306a36Sopenharmony_ci * that the first byte of the record is backed by pages and don't need 30662306a36Sopenharmony_ci * to query it. If instead the record begins at the start of the page 30762306a36Sopenharmony_ci * then we know that querying the last byte is just as good as querying 30862306a36Sopenharmony_ci * the first byte, since records cannot be larger than a page. 30962306a36Sopenharmony_ci * 31062306a36Sopenharmony_ci * If the call returns the same file offset, we know this record is 31162306a36Sopenharmony_ci * backed by real pages. We do not need to move the cursor. 31262306a36Sopenharmony_ci */ 31362306a36Sopenharmony_ci new_pos = xfile_seek_data(array->xfile, end_pos); 31462306a36Sopenharmony_ci if (new_pos == -ENXIO) 31562306a36Sopenharmony_ci return -ENODATA; 31662306a36Sopenharmony_ci if (new_pos < 0) 31762306a36Sopenharmony_ci return new_pos; 31862306a36Sopenharmony_ci if (new_pos == end_pos) 31962306a36Sopenharmony_ci return 0; 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci /* 32262306a36Sopenharmony_ci * Otherwise, SEEK_DATA told us how far up to move the file pointer to 32362306a36Sopenharmony_ci * find more data. Move the array index to the first record past the 32462306a36Sopenharmony_ci * byte offset we were given. 32562306a36Sopenharmony_ci */ 32662306a36Sopenharmony_ci new_pos = roundup_64(new_pos, array->obj_size); 32762306a36Sopenharmony_ci *cur = xfarray_idx(array, new_pos); 32862306a36Sopenharmony_ci *pos = xfarray_pos(array, *cur); 32962306a36Sopenharmony_ci return 0; 33062306a36Sopenharmony_ci} 33162306a36Sopenharmony_ci 33262306a36Sopenharmony_ci/* 33362306a36Sopenharmony_ci * Starting at *idx, fetch the next non-null array entry and advance the index 33462306a36Sopenharmony_ci * to set up the next _load_next call. Returns ENODATA if we reach the end of 33562306a36Sopenharmony_ci * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first 33662306a36Sopenharmony_ci * call to this function. 33762306a36Sopenharmony_ci */ 33862306a36Sopenharmony_ciint 33962306a36Sopenharmony_cixfarray_load_next( 34062306a36Sopenharmony_ci struct xfarray *array, 34162306a36Sopenharmony_ci xfarray_idx_t *idx, 34262306a36Sopenharmony_ci void *rec) 34362306a36Sopenharmony_ci{ 34462306a36Sopenharmony_ci xfarray_idx_t cur = *idx; 34562306a36Sopenharmony_ci loff_t pos = xfarray_pos(array, cur); 34662306a36Sopenharmony_ci int error; 34762306a36Sopenharmony_ci 34862306a36Sopenharmony_ci do { 34962306a36Sopenharmony_ci if (cur >= array->nr) 35062306a36Sopenharmony_ci return -ENODATA; 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci /* 35362306a36Sopenharmony_ci * Ask the backing store for the location of next possible 35462306a36Sopenharmony_ci * written record, then retrieve that record. 35562306a36Sopenharmony_ci */ 35662306a36Sopenharmony_ci error = xfarray_find_data(array, &cur, &pos); 35762306a36Sopenharmony_ci if (error) 35862306a36Sopenharmony_ci return error; 35962306a36Sopenharmony_ci error = xfarray_load(array, cur, rec); 36062306a36Sopenharmony_ci if (error) 36162306a36Sopenharmony_ci return error; 36262306a36Sopenharmony_ci 36362306a36Sopenharmony_ci cur++; 36462306a36Sopenharmony_ci pos += array->obj_size; 36562306a36Sopenharmony_ci } while (xfarray_element_is_null(array, rec)); 36662306a36Sopenharmony_ci 36762306a36Sopenharmony_ci *idx = cur; 36862306a36Sopenharmony_ci return 0; 36962306a36Sopenharmony_ci} 37062306a36Sopenharmony_ci 37162306a36Sopenharmony_ci/* Sorting functions */ 37262306a36Sopenharmony_ci 37362306a36Sopenharmony_ci#ifdef DEBUG 37462306a36Sopenharmony_ci# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0) 37562306a36Sopenharmony_ci# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0) 37662306a36Sopenharmony_ci# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0) 37762306a36Sopenharmony_ci# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0) 37862306a36Sopenharmony_ci#else 37962306a36Sopenharmony_ci# define xfarray_sort_bump_loads(si) 38062306a36Sopenharmony_ci# define xfarray_sort_bump_stores(si) 38162306a36Sopenharmony_ci# define xfarray_sort_bump_compares(si) 38262306a36Sopenharmony_ci# define xfarray_sort_bump_heapsorts(si) 38362306a36Sopenharmony_ci#endif /* DEBUG */ 38462306a36Sopenharmony_ci 38562306a36Sopenharmony_ci/* Load an array element for sorting. */ 38662306a36Sopenharmony_cistatic inline int 38762306a36Sopenharmony_cixfarray_sort_load( 38862306a36Sopenharmony_ci struct xfarray_sortinfo *si, 38962306a36Sopenharmony_ci xfarray_idx_t idx, 39062306a36Sopenharmony_ci void *ptr) 39162306a36Sopenharmony_ci{ 39262306a36Sopenharmony_ci xfarray_sort_bump_loads(si); 39362306a36Sopenharmony_ci return xfarray_load(si->array, idx, ptr); 39462306a36Sopenharmony_ci} 39562306a36Sopenharmony_ci 39662306a36Sopenharmony_ci/* Store an array element for sorting. */ 39762306a36Sopenharmony_cistatic inline int 39862306a36Sopenharmony_cixfarray_sort_store( 39962306a36Sopenharmony_ci struct xfarray_sortinfo *si, 40062306a36Sopenharmony_ci xfarray_idx_t idx, 40162306a36Sopenharmony_ci void *ptr) 40262306a36Sopenharmony_ci{ 40362306a36Sopenharmony_ci xfarray_sort_bump_stores(si); 40462306a36Sopenharmony_ci return xfarray_store(si->array, idx, ptr); 40562306a36Sopenharmony_ci} 40662306a36Sopenharmony_ci 40762306a36Sopenharmony_ci/* Compare an array element for sorting. */ 40862306a36Sopenharmony_cistatic inline int 40962306a36Sopenharmony_cixfarray_sort_cmp( 41062306a36Sopenharmony_ci struct xfarray_sortinfo *si, 41162306a36Sopenharmony_ci const void *a, 41262306a36Sopenharmony_ci const void *b) 41362306a36Sopenharmony_ci{ 41462306a36Sopenharmony_ci xfarray_sort_bump_compares(si); 41562306a36Sopenharmony_ci return si->cmp_fn(a, b); 41662306a36Sopenharmony_ci} 41762306a36Sopenharmony_ci 41862306a36Sopenharmony_ci/* Return a pointer to the low index stack for quicksort partitioning. */ 41962306a36Sopenharmony_cistatic inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si) 42062306a36Sopenharmony_ci{ 42162306a36Sopenharmony_ci return (xfarray_idx_t *)(si + 1); 42262306a36Sopenharmony_ci} 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_ci/* Return a pointer to the high index stack for quicksort partitioning. */ 42562306a36Sopenharmony_cistatic inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si) 42662306a36Sopenharmony_ci{ 42762306a36Sopenharmony_ci return xfarray_sortinfo_lo(si) + si->max_stack_depth; 42862306a36Sopenharmony_ci} 42962306a36Sopenharmony_ci 43062306a36Sopenharmony_ci/* Size of each element in the quicksort pivot array. */ 43162306a36Sopenharmony_cistatic inline size_t 43262306a36Sopenharmony_cixfarray_pivot_rec_sz( 43362306a36Sopenharmony_ci struct xfarray *array) 43462306a36Sopenharmony_ci{ 43562306a36Sopenharmony_ci return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t); 43662306a36Sopenharmony_ci} 43762306a36Sopenharmony_ci 43862306a36Sopenharmony_ci/* Allocate memory to handle the sort. */ 43962306a36Sopenharmony_cistatic inline int 44062306a36Sopenharmony_cixfarray_sortinfo_alloc( 44162306a36Sopenharmony_ci struct xfarray *array, 44262306a36Sopenharmony_ci xfarray_cmp_fn cmp_fn, 44362306a36Sopenharmony_ci unsigned int flags, 44462306a36Sopenharmony_ci struct xfarray_sortinfo **infop) 44562306a36Sopenharmony_ci{ 44662306a36Sopenharmony_ci struct xfarray_sortinfo *si; 44762306a36Sopenharmony_ci size_t nr_bytes = sizeof(struct xfarray_sortinfo); 44862306a36Sopenharmony_ci size_t pivot_rec_sz = xfarray_pivot_rec_sz(array); 44962306a36Sopenharmony_ci int max_stack_depth; 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci /* 45262306a36Sopenharmony_ci * The median-of-nine pivot algorithm doesn't work if a subset has 45362306a36Sopenharmony_ci * fewer than 9 items. Make sure the in-memory sort will always take 45462306a36Sopenharmony_ci * over for subsets where this wouldn't be the case. 45562306a36Sopenharmony_ci */ 45662306a36Sopenharmony_ci BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR); 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci /* 45962306a36Sopenharmony_ci * Tail-call recursion during the partitioning phase means that 46062306a36Sopenharmony_ci * quicksort will never recurse more than log2(nr) times. We need one 46162306a36Sopenharmony_ci * extra level of stack to hold the initial parameters. In-memory 46262306a36Sopenharmony_ci * sort will always take care of the last few levels of recursion for 46362306a36Sopenharmony_ci * us, so we can reduce the stack depth by that much. 46462306a36Sopenharmony_ci */ 46562306a36Sopenharmony_ci max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1); 46662306a36Sopenharmony_ci if (max_stack_depth < 1) 46762306a36Sopenharmony_ci max_stack_depth = 1; 46862306a36Sopenharmony_ci 46962306a36Sopenharmony_ci /* Each level of quicksort uses a lo and a hi index */ 47062306a36Sopenharmony_ci nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2; 47162306a36Sopenharmony_ci 47262306a36Sopenharmony_ci /* Scratchpad for in-memory sort, or finding the pivot */ 47362306a36Sopenharmony_ci nr_bytes += max_t(size_t, 47462306a36Sopenharmony_ci (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz, 47562306a36Sopenharmony_ci XFARRAY_ISORT_NR * array->obj_size); 47662306a36Sopenharmony_ci 47762306a36Sopenharmony_ci si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS); 47862306a36Sopenharmony_ci if (!si) 47962306a36Sopenharmony_ci return -ENOMEM; 48062306a36Sopenharmony_ci 48162306a36Sopenharmony_ci si->array = array; 48262306a36Sopenharmony_ci si->cmp_fn = cmp_fn; 48362306a36Sopenharmony_ci si->flags = flags; 48462306a36Sopenharmony_ci si->max_stack_depth = max_stack_depth; 48562306a36Sopenharmony_ci si->max_stack_used = 1; 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci xfarray_sortinfo_lo(si)[0] = 0; 48862306a36Sopenharmony_ci xfarray_sortinfo_hi(si)[0] = array->nr - 1; 48962306a36Sopenharmony_ci 49062306a36Sopenharmony_ci trace_xfarray_sort(si, nr_bytes); 49162306a36Sopenharmony_ci *infop = si; 49262306a36Sopenharmony_ci return 0; 49362306a36Sopenharmony_ci} 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci/* Should this sort be terminated by a fatal signal? */ 49662306a36Sopenharmony_cistatic inline bool 49762306a36Sopenharmony_cixfarray_sort_terminated( 49862306a36Sopenharmony_ci struct xfarray_sortinfo *si, 49962306a36Sopenharmony_ci int *error) 50062306a36Sopenharmony_ci{ 50162306a36Sopenharmony_ci /* 50262306a36Sopenharmony_ci * If preemption is disabled, we need to yield to the scheduler every 50362306a36Sopenharmony_ci * few seconds so that we don't run afoul of the soft lockup watchdog 50462306a36Sopenharmony_ci * or RCU stall detector. 50562306a36Sopenharmony_ci */ 50662306a36Sopenharmony_ci cond_resched(); 50762306a36Sopenharmony_ci 50862306a36Sopenharmony_ci if ((si->flags & XFARRAY_SORT_KILLABLE) && 50962306a36Sopenharmony_ci fatal_signal_pending(current)) { 51062306a36Sopenharmony_ci if (*error == 0) 51162306a36Sopenharmony_ci *error = -EINTR; 51262306a36Sopenharmony_ci return true; 51362306a36Sopenharmony_ci } 51462306a36Sopenharmony_ci return false; 51562306a36Sopenharmony_ci} 51662306a36Sopenharmony_ci 51762306a36Sopenharmony_ci/* Do we want an in-memory sort? */ 51862306a36Sopenharmony_cistatic inline bool 51962306a36Sopenharmony_cixfarray_want_isort( 52062306a36Sopenharmony_ci struct xfarray_sortinfo *si, 52162306a36Sopenharmony_ci xfarray_idx_t start, 52262306a36Sopenharmony_ci xfarray_idx_t end) 52362306a36Sopenharmony_ci{ 52462306a36Sopenharmony_ci /* 52562306a36Sopenharmony_ci * For array subsets that fit in the scratchpad, it's much faster to 52662306a36Sopenharmony_ci * use the kernel's heapsort than quicksort's stack machine. 52762306a36Sopenharmony_ci */ 52862306a36Sopenharmony_ci return (end - start) < XFARRAY_ISORT_NR; 52962306a36Sopenharmony_ci} 53062306a36Sopenharmony_ci 53162306a36Sopenharmony_ci/* Return the scratch space within the sortinfo structure. */ 53262306a36Sopenharmony_cistatic inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si) 53362306a36Sopenharmony_ci{ 53462306a36Sopenharmony_ci return xfarray_sortinfo_hi(si) + si->max_stack_depth; 53562306a36Sopenharmony_ci} 53662306a36Sopenharmony_ci 53762306a36Sopenharmony_ci/* 53862306a36Sopenharmony_ci * Sort a small number of array records using scratchpad memory. The records 53962306a36Sopenharmony_ci * need not be contiguous in the xfile's memory pages. 54062306a36Sopenharmony_ci */ 54162306a36Sopenharmony_ciSTATIC int 54262306a36Sopenharmony_cixfarray_isort( 54362306a36Sopenharmony_ci struct xfarray_sortinfo *si, 54462306a36Sopenharmony_ci xfarray_idx_t lo, 54562306a36Sopenharmony_ci xfarray_idx_t hi) 54662306a36Sopenharmony_ci{ 54762306a36Sopenharmony_ci void *scratch = xfarray_sortinfo_isort_scratch(si); 54862306a36Sopenharmony_ci loff_t lo_pos = xfarray_pos(si->array, lo); 54962306a36Sopenharmony_ci loff_t len = xfarray_pos(si->array, hi - lo + 1); 55062306a36Sopenharmony_ci int error; 55162306a36Sopenharmony_ci 55262306a36Sopenharmony_ci trace_xfarray_isort(si, lo, hi); 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ci xfarray_sort_bump_loads(si); 55562306a36Sopenharmony_ci error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); 55662306a36Sopenharmony_ci if (error) 55762306a36Sopenharmony_ci return error; 55862306a36Sopenharmony_ci 55962306a36Sopenharmony_ci xfarray_sort_bump_heapsorts(si); 56062306a36Sopenharmony_ci sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); 56162306a36Sopenharmony_ci 56262306a36Sopenharmony_ci xfarray_sort_bump_stores(si); 56362306a36Sopenharmony_ci return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); 56462306a36Sopenharmony_ci} 56562306a36Sopenharmony_ci 56662306a36Sopenharmony_ci/* Grab a page for sorting records. */ 56762306a36Sopenharmony_cistatic inline int 56862306a36Sopenharmony_cixfarray_sort_get_page( 56962306a36Sopenharmony_ci struct xfarray_sortinfo *si, 57062306a36Sopenharmony_ci loff_t pos, 57162306a36Sopenharmony_ci uint64_t len) 57262306a36Sopenharmony_ci{ 57362306a36Sopenharmony_ci int error; 57462306a36Sopenharmony_ci 57562306a36Sopenharmony_ci error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); 57662306a36Sopenharmony_ci if (error) 57762306a36Sopenharmony_ci return error; 57862306a36Sopenharmony_ci 57962306a36Sopenharmony_ci /* 58062306a36Sopenharmony_ci * xfile pages must never be mapped into userspace, so we skip the 58162306a36Sopenharmony_ci * dcache flush when mapping the page. 58262306a36Sopenharmony_ci */ 58362306a36Sopenharmony_ci si->page_kaddr = kmap_local_page(si->xfpage.page); 58462306a36Sopenharmony_ci return 0; 58562306a36Sopenharmony_ci} 58662306a36Sopenharmony_ci 58762306a36Sopenharmony_ci/* Release a page we grabbed for sorting records. */ 58862306a36Sopenharmony_cistatic inline int 58962306a36Sopenharmony_cixfarray_sort_put_page( 59062306a36Sopenharmony_ci struct xfarray_sortinfo *si) 59162306a36Sopenharmony_ci{ 59262306a36Sopenharmony_ci if (!si->page_kaddr) 59362306a36Sopenharmony_ci return 0; 59462306a36Sopenharmony_ci 59562306a36Sopenharmony_ci kunmap_local(si->page_kaddr); 59662306a36Sopenharmony_ci si->page_kaddr = NULL; 59762306a36Sopenharmony_ci 59862306a36Sopenharmony_ci return xfile_put_page(si->array->xfile, &si->xfpage); 59962306a36Sopenharmony_ci} 60062306a36Sopenharmony_ci 60162306a36Sopenharmony_ci/* Decide if these records are eligible for in-page sorting. */ 60262306a36Sopenharmony_cistatic inline bool 60362306a36Sopenharmony_cixfarray_want_pagesort( 60462306a36Sopenharmony_ci struct xfarray_sortinfo *si, 60562306a36Sopenharmony_ci xfarray_idx_t lo, 60662306a36Sopenharmony_ci xfarray_idx_t hi) 60762306a36Sopenharmony_ci{ 60862306a36Sopenharmony_ci pgoff_t lo_page; 60962306a36Sopenharmony_ci pgoff_t hi_page; 61062306a36Sopenharmony_ci loff_t end_pos; 61162306a36Sopenharmony_ci 61262306a36Sopenharmony_ci /* We can only map one page at a time. */ 61362306a36Sopenharmony_ci lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; 61462306a36Sopenharmony_ci end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; 61562306a36Sopenharmony_ci hi_page = end_pos >> PAGE_SHIFT; 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci return lo_page == hi_page; 61862306a36Sopenharmony_ci} 61962306a36Sopenharmony_ci 62062306a36Sopenharmony_ci/* Sort a bunch of records that all live in the same memory page. */ 62162306a36Sopenharmony_ciSTATIC int 62262306a36Sopenharmony_cixfarray_pagesort( 62362306a36Sopenharmony_ci struct xfarray_sortinfo *si, 62462306a36Sopenharmony_ci xfarray_idx_t lo, 62562306a36Sopenharmony_ci xfarray_idx_t hi) 62662306a36Sopenharmony_ci{ 62762306a36Sopenharmony_ci void *startp; 62862306a36Sopenharmony_ci loff_t lo_pos = xfarray_pos(si->array, lo); 62962306a36Sopenharmony_ci uint64_t len = xfarray_pos(si->array, hi - lo); 63062306a36Sopenharmony_ci int error = 0; 63162306a36Sopenharmony_ci 63262306a36Sopenharmony_ci trace_xfarray_pagesort(si, lo, hi); 63362306a36Sopenharmony_ci 63462306a36Sopenharmony_ci xfarray_sort_bump_loads(si); 63562306a36Sopenharmony_ci error = xfarray_sort_get_page(si, lo_pos, len); 63662306a36Sopenharmony_ci if (error) 63762306a36Sopenharmony_ci return error; 63862306a36Sopenharmony_ci 63962306a36Sopenharmony_ci xfarray_sort_bump_heapsorts(si); 64062306a36Sopenharmony_ci startp = si->page_kaddr + offset_in_page(lo_pos); 64162306a36Sopenharmony_ci sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); 64262306a36Sopenharmony_ci 64362306a36Sopenharmony_ci xfarray_sort_bump_stores(si); 64462306a36Sopenharmony_ci return xfarray_sort_put_page(si); 64562306a36Sopenharmony_ci} 64662306a36Sopenharmony_ci 64762306a36Sopenharmony_ci/* Return a pointer to the xfarray pivot record within the sortinfo struct. */ 64862306a36Sopenharmony_cistatic inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si) 64962306a36Sopenharmony_ci{ 65062306a36Sopenharmony_ci return xfarray_sortinfo_hi(si) + si->max_stack_depth; 65162306a36Sopenharmony_ci} 65262306a36Sopenharmony_ci 65362306a36Sopenharmony_ci/* Return a pointer to the start of the pivot array. */ 65462306a36Sopenharmony_cistatic inline void * 65562306a36Sopenharmony_cixfarray_sortinfo_pivot_array( 65662306a36Sopenharmony_ci struct xfarray_sortinfo *si) 65762306a36Sopenharmony_ci{ 65862306a36Sopenharmony_ci return xfarray_sortinfo_pivot(si) + si->array->obj_size; 65962306a36Sopenharmony_ci} 66062306a36Sopenharmony_ci 66162306a36Sopenharmony_ci/* The xfarray record is stored at the start of each pivot array element. */ 66262306a36Sopenharmony_cistatic inline void * 66362306a36Sopenharmony_cixfarray_pivot_array_rec( 66462306a36Sopenharmony_ci void *pa, 66562306a36Sopenharmony_ci size_t pa_recsz, 66662306a36Sopenharmony_ci unsigned int pa_idx) 66762306a36Sopenharmony_ci{ 66862306a36Sopenharmony_ci return pa + (pa_recsz * pa_idx); 66962306a36Sopenharmony_ci} 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci/* The xfarray index is stored at the end of each pivot array element. */ 67262306a36Sopenharmony_cistatic inline xfarray_idx_t * 67362306a36Sopenharmony_cixfarray_pivot_array_idx( 67462306a36Sopenharmony_ci void *pa, 67562306a36Sopenharmony_ci size_t pa_recsz, 67662306a36Sopenharmony_ci unsigned int pa_idx) 67762306a36Sopenharmony_ci{ 67862306a36Sopenharmony_ci return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) - 67962306a36Sopenharmony_ci sizeof(xfarray_idx_t); 68062306a36Sopenharmony_ci} 68162306a36Sopenharmony_ci 68262306a36Sopenharmony_ci/* 68362306a36Sopenharmony_ci * Find a pivot value for quicksort partitioning, swap it with a[lo], and save 68462306a36Sopenharmony_ci * the cached pivot record for the next step. 68562306a36Sopenharmony_ci * 68662306a36Sopenharmony_ci * Load evenly-spaced records within the given range into memory, sort them, 68762306a36Sopenharmony_ci * and choose the pivot from the median record. Using multiple points will 68862306a36Sopenharmony_ci * improve the quality of the pivot selection, and hopefully avoid the worst 68962306a36Sopenharmony_ci * quicksort behavior, since our array values are nearly always evenly sorted. 69062306a36Sopenharmony_ci */ 69162306a36Sopenharmony_ciSTATIC int 69262306a36Sopenharmony_cixfarray_qsort_pivot( 69362306a36Sopenharmony_ci struct xfarray_sortinfo *si, 69462306a36Sopenharmony_ci xfarray_idx_t lo, 69562306a36Sopenharmony_ci xfarray_idx_t hi) 69662306a36Sopenharmony_ci{ 69762306a36Sopenharmony_ci void *pivot = xfarray_sortinfo_pivot(si); 69862306a36Sopenharmony_ci void *parray = xfarray_sortinfo_pivot_array(si); 69962306a36Sopenharmony_ci void *recp; 70062306a36Sopenharmony_ci xfarray_idx_t *idxp; 70162306a36Sopenharmony_ci xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1); 70262306a36Sopenharmony_ci size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array); 70362306a36Sopenharmony_ci int i, j; 70462306a36Sopenharmony_ci int error; 70562306a36Sopenharmony_ci 70662306a36Sopenharmony_ci ASSERT(step > 0); 70762306a36Sopenharmony_ci 70862306a36Sopenharmony_ci /* 70962306a36Sopenharmony_ci * Load the xfarray indexes of the records we intend to sample into the 71062306a36Sopenharmony_ci * pivot array. 71162306a36Sopenharmony_ci */ 71262306a36Sopenharmony_ci idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0); 71362306a36Sopenharmony_ci *idxp = lo; 71462306a36Sopenharmony_ci for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) { 71562306a36Sopenharmony_ci idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); 71662306a36Sopenharmony_ci *idxp = lo + (i * step); 71762306a36Sopenharmony_ci } 71862306a36Sopenharmony_ci idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 71962306a36Sopenharmony_ci XFARRAY_QSORT_PIVOT_NR - 1); 72062306a36Sopenharmony_ci *idxp = hi; 72162306a36Sopenharmony_ci 72262306a36Sopenharmony_ci /* Load the selected xfarray records into the pivot array. */ 72362306a36Sopenharmony_ci for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) { 72462306a36Sopenharmony_ci xfarray_idx_t idx; 72562306a36Sopenharmony_ci 72662306a36Sopenharmony_ci recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i); 72762306a36Sopenharmony_ci idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); 72862306a36Sopenharmony_ci 72962306a36Sopenharmony_ci /* No unset records; load directly into the array. */ 73062306a36Sopenharmony_ci if (likely(si->array->unset_slots == 0)) { 73162306a36Sopenharmony_ci error = xfarray_sort_load(si, *idxp, recp); 73262306a36Sopenharmony_ci if (error) 73362306a36Sopenharmony_ci return error; 73462306a36Sopenharmony_ci continue; 73562306a36Sopenharmony_ci } 73662306a36Sopenharmony_ci 73762306a36Sopenharmony_ci /* 73862306a36Sopenharmony_ci * Load non-null records into the scratchpad without changing 73962306a36Sopenharmony_ci * the xfarray_idx_t in the pivot array. 74062306a36Sopenharmony_ci */ 74162306a36Sopenharmony_ci idx = *idxp; 74262306a36Sopenharmony_ci xfarray_sort_bump_loads(si); 74362306a36Sopenharmony_ci error = xfarray_load_next(si->array, &idx, recp); 74462306a36Sopenharmony_ci if (error) 74562306a36Sopenharmony_ci return error; 74662306a36Sopenharmony_ci } 74762306a36Sopenharmony_ci 74862306a36Sopenharmony_ci xfarray_sort_bump_heapsorts(si); 74962306a36Sopenharmony_ci sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL); 75062306a36Sopenharmony_ci 75162306a36Sopenharmony_ci /* 75262306a36Sopenharmony_ci * We sorted the pivot array records (which includes the xfarray 75362306a36Sopenharmony_ci * indices) in xfarray record order. The median element of the pivot 75462306a36Sopenharmony_ci * array contains the xfarray record that we will use as the pivot. 75562306a36Sopenharmony_ci * Copy that xfarray record to the designated space. 75662306a36Sopenharmony_ci */ 75762306a36Sopenharmony_ci recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, 75862306a36Sopenharmony_ci XFARRAY_QSORT_PIVOT_NR / 2); 75962306a36Sopenharmony_ci memcpy(pivot, recp, si->array->obj_size); 76062306a36Sopenharmony_ci 76162306a36Sopenharmony_ci /* If the pivot record we chose was already in a[lo] then we're done. */ 76262306a36Sopenharmony_ci idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 76362306a36Sopenharmony_ci XFARRAY_QSORT_PIVOT_NR / 2); 76462306a36Sopenharmony_ci if (*idxp == lo) 76562306a36Sopenharmony_ci return 0; 76662306a36Sopenharmony_ci 76762306a36Sopenharmony_ci /* 76862306a36Sopenharmony_ci * Find the cached copy of a[lo] in the pivot array so that we can swap 76962306a36Sopenharmony_ci * a[lo] and a[pivot]. 77062306a36Sopenharmony_ci */ 77162306a36Sopenharmony_ci for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) { 77262306a36Sopenharmony_ci idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); 77362306a36Sopenharmony_ci if (*idxp == lo) 77462306a36Sopenharmony_ci j = i; 77562306a36Sopenharmony_ci } 77662306a36Sopenharmony_ci if (j < 0) { 77762306a36Sopenharmony_ci ASSERT(j >= 0); 77862306a36Sopenharmony_ci return -EFSCORRUPTED; 77962306a36Sopenharmony_ci } 78062306a36Sopenharmony_ci 78162306a36Sopenharmony_ci /* Swap a[lo] and a[pivot]. */ 78262306a36Sopenharmony_ci error = xfarray_sort_store(si, lo, pivot); 78362306a36Sopenharmony_ci if (error) 78462306a36Sopenharmony_ci return error; 78562306a36Sopenharmony_ci 78662306a36Sopenharmony_ci recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j); 78762306a36Sopenharmony_ci idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 78862306a36Sopenharmony_ci XFARRAY_QSORT_PIVOT_NR / 2); 78962306a36Sopenharmony_ci return xfarray_sort_store(si, *idxp, recp); 79062306a36Sopenharmony_ci} 79162306a36Sopenharmony_ci 79262306a36Sopenharmony_ci/* 79362306a36Sopenharmony_ci * Set up the pointers for the next iteration. We push onto the stack all of 79462306a36Sopenharmony_ci * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the 79562306a36Sopenharmony_ci * current stack frame to point to the unsorted values between a[beg[i]] and 79662306a36Sopenharmony_ci * a[lo] so that those values will be sorted when we pop the stack. 79762306a36Sopenharmony_ci */ 79862306a36Sopenharmony_cistatic inline int 79962306a36Sopenharmony_cixfarray_qsort_push( 80062306a36Sopenharmony_ci struct xfarray_sortinfo *si, 80162306a36Sopenharmony_ci xfarray_idx_t *si_lo, 80262306a36Sopenharmony_ci xfarray_idx_t *si_hi, 80362306a36Sopenharmony_ci xfarray_idx_t lo, 80462306a36Sopenharmony_ci xfarray_idx_t hi) 80562306a36Sopenharmony_ci{ 80662306a36Sopenharmony_ci /* Check for stack overflows */ 80762306a36Sopenharmony_ci if (si->stack_depth >= si->max_stack_depth - 1) { 80862306a36Sopenharmony_ci ASSERT(si->stack_depth < si->max_stack_depth - 1); 80962306a36Sopenharmony_ci return -EFSCORRUPTED; 81062306a36Sopenharmony_ci } 81162306a36Sopenharmony_ci 81262306a36Sopenharmony_ci si->max_stack_used = max_t(uint8_t, si->max_stack_used, 81362306a36Sopenharmony_ci si->stack_depth + 2); 81462306a36Sopenharmony_ci 81562306a36Sopenharmony_ci si_lo[si->stack_depth + 1] = lo + 1; 81662306a36Sopenharmony_ci si_hi[si->stack_depth + 1] = si_hi[si->stack_depth]; 81762306a36Sopenharmony_ci si_hi[si->stack_depth++] = lo - 1; 81862306a36Sopenharmony_ci 81962306a36Sopenharmony_ci /* 82062306a36Sopenharmony_ci * Always start with the smaller of the two partitions to keep the 82162306a36Sopenharmony_ci * amount of recursion in check. 82262306a36Sopenharmony_ci */ 82362306a36Sopenharmony_ci if (si_hi[si->stack_depth] - si_lo[si->stack_depth] > 82462306a36Sopenharmony_ci si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) { 82562306a36Sopenharmony_ci swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]); 82662306a36Sopenharmony_ci swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]); 82762306a36Sopenharmony_ci } 82862306a36Sopenharmony_ci 82962306a36Sopenharmony_ci return 0; 83062306a36Sopenharmony_ci} 83162306a36Sopenharmony_ci 83262306a36Sopenharmony_ci/* 83362306a36Sopenharmony_ci * Load an element from the array into the first scratchpad and cache the page, 83462306a36Sopenharmony_ci * if possible. 83562306a36Sopenharmony_ci */ 83662306a36Sopenharmony_cistatic inline int 83762306a36Sopenharmony_cixfarray_sort_load_cached( 83862306a36Sopenharmony_ci struct xfarray_sortinfo *si, 83962306a36Sopenharmony_ci xfarray_idx_t idx, 84062306a36Sopenharmony_ci void *ptr) 84162306a36Sopenharmony_ci{ 84262306a36Sopenharmony_ci loff_t idx_pos = xfarray_pos(si->array, idx); 84362306a36Sopenharmony_ci pgoff_t startpage; 84462306a36Sopenharmony_ci pgoff_t endpage; 84562306a36Sopenharmony_ci int error = 0; 84662306a36Sopenharmony_ci 84762306a36Sopenharmony_ci /* 84862306a36Sopenharmony_ci * If this load would split a page, release the cached page, if any, 84962306a36Sopenharmony_ci * and perform a traditional read. 85062306a36Sopenharmony_ci */ 85162306a36Sopenharmony_ci startpage = idx_pos >> PAGE_SHIFT; 85262306a36Sopenharmony_ci endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; 85362306a36Sopenharmony_ci if (startpage != endpage) { 85462306a36Sopenharmony_ci error = xfarray_sort_put_page(si); 85562306a36Sopenharmony_ci if (error) 85662306a36Sopenharmony_ci return error; 85762306a36Sopenharmony_ci 85862306a36Sopenharmony_ci if (xfarray_sort_terminated(si, &error)) 85962306a36Sopenharmony_ci return error; 86062306a36Sopenharmony_ci 86162306a36Sopenharmony_ci return xfile_obj_load(si->array->xfile, ptr, 86262306a36Sopenharmony_ci si->array->obj_size, idx_pos); 86362306a36Sopenharmony_ci } 86462306a36Sopenharmony_ci 86562306a36Sopenharmony_ci /* If the cached page is not the one we want, release it. */ 86662306a36Sopenharmony_ci if (xfile_page_cached(&si->xfpage) && 86762306a36Sopenharmony_ci xfile_page_index(&si->xfpage) != startpage) { 86862306a36Sopenharmony_ci error = xfarray_sort_put_page(si); 86962306a36Sopenharmony_ci if (error) 87062306a36Sopenharmony_ci return error; 87162306a36Sopenharmony_ci } 87262306a36Sopenharmony_ci 87362306a36Sopenharmony_ci /* 87462306a36Sopenharmony_ci * If we don't have a cached page (and we know the load is contained 87562306a36Sopenharmony_ci * in a single page) then grab it. 87662306a36Sopenharmony_ci */ 87762306a36Sopenharmony_ci if (!xfile_page_cached(&si->xfpage)) { 87862306a36Sopenharmony_ci if (xfarray_sort_terminated(si, &error)) 87962306a36Sopenharmony_ci return error; 88062306a36Sopenharmony_ci 88162306a36Sopenharmony_ci error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, 88262306a36Sopenharmony_ci PAGE_SIZE); 88362306a36Sopenharmony_ci if (error) 88462306a36Sopenharmony_ci return error; 88562306a36Sopenharmony_ci } 88662306a36Sopenharmony_ci 88762306a36Sopenharmony_ci memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), 88862306a36Sopenharmony_ci si->array->obj_size); 88962306a36Sopenharmony_ci return 0; 89062306a36Sopenharmony_ci} 89162306a36Sopenharmony_ci 89262306a36Sopenharmony_ci/* 89362306a36Sopenharmony_ci * Sort the array elements via quicksort. This implementation incorporates 89462306a36Sopenharmony_ci * four optimizations discussed in Sedgewick: 89562306a36Sopenharmony_ci * 89662306a36Sopenharmony_ci * 1. Use an explicit stack of array indices to store the next array partition 89762306a36Sopenharmony_ci * to sort. This helps us to avoid recursion in the call stack, which is 89862306a36Sopenharmony_ci * particularly expensive in the kernel. 89962306a36Sopenharmony_ci * 90062306a36Sopenharmony_ci * 2. For arrays with records in arbitrary or user-controlled order, choose the 90162306a36Sopenharmony_ci * pivot element using a median-of-nine decision tree. This reduces the 90262306a36Sopenharmony_ci * probability of selecting a bad pivot value which causes worst case 90362306a36Sopenharmony_ci * behavior (i.e. partition sizes of 1). 90462306a36Sopenharmony_ci * 90562306a36Sopenharmony_ci * 3. The smaller of the two sub-partitions is pushed onto the stack to start 90662306a36Sopenharmony_ci * the next level of recursion, and the larger sub-partition replaces the 90762306a36Sopenharmony_ci * current stack frame. This guarantees that we won't need more than 90862306a36Sopenharmony_ci * log2(nr) stack space. 90962306a36Sopenharmony_ci * 91062306a36Sopenharmony_ci * 4. For small sets, load the records into the scratchpad and run heapsort on 91162306a36Sopenharmony_ci * them because that is very fast. In the author's experience, this yields 91262306a36Sopenharmony_ci * a ~10% reduction in runtime. 91362306a36Sopenharmony_ci * 91462306a36Sopenharmony_ci * If a small set is contained entirely within a single xfile memory page, 91562306a36Sopenharmony_ci * map the page directly and run heap sort directly on the xfile page 91662306a36Sopenharmony_ci * instead of using the load/store interface. This halves the runtime. 91762306a36Sopenharmony_ci * 91862306a36Sopenharmony_ci * 5. This optimization is specific to the implementation. When converging lo 91962306a36Sopenharmony_ci * and hi after selecting a pivot, we will try to retain the xfile memory 92062306a36Sopenharmony_ci * page between load calls, which reduces run time by 50%. 92162306a36Sopenharmony_ci */ 92262306a36Sopenharmony_ci 92362306a36Sopenharmony_ci/* 92462306a36Sopenharmony_ci * Due to the use of signed indices, we can only support up to 2^63 records. 92562306a36Sopenharmony_ci * Files can only grow to 2^63 bytes, so this is not much of a limitation. 92662306a36Sopenharmony_ci */ 92762306a36Sopenharmony_ci#define QSORT_MAX_RECS (1ULL << 63) 92862306a36Sopenharmony_ci 92962306a36Sopenharmony_ciint 93062306a36Sopenharmony_cixfarray_sort( 93162306a36Sopenharmony_ci struct xfarray *array, 93262306a36Sopenharmony_ci xfarray_cmp_fn cmp_fn, 93362306a36Sopenharmony_ci unsigned int flags) 93462306a36Sopenharmony_ci{ 93562306a36Sopenharmony_ci struct xfarray_sortinfo *si; 93662306a36Sopenharmony_ci xfarray_idx_t *si_lo, *si_hi; 93762306a36Sopenharmony_ci void *pivot; 93862306a36Sopenharmony_ci void *scratch = xfarray_scratch(array); 93962306a36Sopenharmony_ci xfarray_idx_t lo, hi; 94062306a36Sopenharmony_ci int error = 0; 94162306a36Sopenharmony_ci 94262306a36Sopenharmony_ci if (array->nr < 2) 94362306a36Sopenharmony_ci return 0; 94462306a36Sopenharmony_ci if (array->nr >= QSORT_MAX_RECS) 94562306a36Sopenharmony_ci return -E2BIG; 94662306a36Sopenharmony_ci 94762306a36Sopenharmony_ci error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si); 94862306a36Sopenharmony_ci if (error) 94962306a36Sopenharmony_ci return error; 95062306a36Sopenharmony_ci si_lo = xfarray_sortinfo_lo(si); 95162306a36Sopenharmony_ci si_hi = xfarray_sortinfo_hi(si); 95262306a36Sopenharmony_ci pivot = xfarray_sortinfo_pivot(si); 95362306a36Sopenharmony_ci 95462306a36Sopenharmony_ci while (si->stack_depth >= 0) { 95562306a36Sopenharmony_ci lo = si_lo[si->stack_depth]; 95662306a36Sopenharmony_ci hi = si_hi[si->stack_depth]; 95762306a36Sopenharmony_ci 95862306a36Sopenharmony_ci trace_xfarray_qsort(si, lo, hi); 95962306a36Sopenharmony_ci 96062306a36Sopenharmony_ci /* Nothing left in this partition to sort; pop stack. */ 96162306a36Sopenharmony_ci if (lo >= hi) { 96262306a36Sopenharmony_ci si->stack_depth--; 96362306a36Sopenharmony_ci continue; 96462306a36Sopenharmony_ci } 96562306a36Sopenharmony_ci 96662306a36Sopenharmony_ci /* 96762306a36Sopenharmony_ci * If directly mapping the page and sorting can solve our 96862306a36Sopenharmony_ci * problems, we're done. 96962306a36Sopenharmony_ci */ 97062306a36Sopenharmony_ci if (xfarray_want_pagesort(si, lo, hi)) { 97162306a36Sopenharmony_ci error = xfarray_pagesort(si, lo, hi); 97262306a36Sopenharmony_ci if (error) 97362306a36Sopenharmony_ci goto out_free; 97462306a36Sopenharmony_ci si->stack_depth--; 97562306a36Sopenharmony_ci continue; 97662306a36Sopenharmony_ci } 97762306a36Sopenharmony_ci 97862306a36Sopenharmony_ci /* If insertion sort can solve our problems, we're done. */ 97962306a36Sopenharmony_ci if (xfarray_want_isort(si, lo, hi)) { 98062306a36Sopenharmony_ci error = xfarray_isort(si, lo, hi); 98162306a36Sopenharmony_ci if (error) 98262306a36Sopenharmony_ci goto out_free; 98362306a36Sopenharmony_ci si->stack_depth--; 98462306a36Sopenharmony_ci continue; 98562306a36Sopenharmony_ci } 98662306a36Sopenharmony_ci 98762306a36Sopenharmony_ci /* Pick a pivot, move it to a[lo] and stash it. */ 98862306a36Sopenharmony_ci error = xfarray_qsort_pivot(si, lo, hi); 98962306a36Sopenharmony_ci if (error) 99062306a36Sopenharmony_ci goto out_free; 99162306a36Sopenharmony_ci 99262306a36Sopenharmony_ci /* 99362306a36Sopenharmony_ci * Rearrange a[lo..hi] such that everything smaller than the 99462306a36Sopenharmony_ci * pivot is on the left side of the range and everything larger 99562306a36Sopenharmony_ci * than the pivot is on the right side of the range. 99662306a36Sopenharmony_ci */ 99762306a36Sopenharmony_ci while (lo < hi) { 99862306a36Sopenharmony_ci /* 99962306a36Sopenharmony_ci * Decrement hi until it finds an a[hi] less than the 100062306a36Sopenharmony_ci * pivot value. 100162306a36Sopenharmony_ci */ 100262306a36Sopenharmony_ci error = xfarray_sort_load_cached(si, hi, scratch); 100362306a36Sopenharmony_ci if (error) 100462306a36Sopenharmony_ci goto out_free; 100562306a36Sopenharmony_ci while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && 100662306a36Sopenharmony_ci lo < hi) { 100762306a36Sopenharmony_ci hi--; 100862306a36Sopenharmony_ci error = xfarray_sort_load_cached(si, hi, 100962306a36Sopenharmony_ci scratch); 101062306a36Sopenharmony_ci if (error) 101162306a36Sopenharmony_ci goto out_free; 101262306a36Sopenharmony_ci } 101362306a36Sopenharmony_ci error = xfarray_sort_put_page(si); 101462306a36Sopenharmony_ci if (error) 101562306a36Sopenharmony_ci goto out_free; 101662306a36Sopenharmony_ci 101762306a36Sopenharmony_ci if (xfarray_sort_terminated(si, &error)) 101862306a36Sopenharmony_ci goto out_free; 101962306a36Sopenharmony_ci 102062306a36Sopenharmony_ci /* Copy that item (a[hi]) to a[lo]. */ 102162306a36Sopenharmony_ci if (lo < hi) { 102262306a36Sopenharmony_ci error = xfarray_sort_store(si, lo++, scratch); 102362306a36Sopenharmony_ci if (error) 102462306a36Sopenharmony_ci goto out_free; 102562306a36Sopenharmony_ci } 102662306a36Sopenharmony_ci 102762306a36Sopenharmony_ci /* 102862306a36Sopenharmony_ci * Increment lo until it finds an a[lo] greater than 102962306a36Sopenharmony_ci * the pivot value. 103062306a36Sopenharmony_ci */ 103162306a36Sopenharmony_ci error = xfarray_sort_load_cached(si, lo, scratch); 103262306a36Sopenharmony_ci if (error) 103362306a36Sopenharmony_ci goto out_free; 103462306a36Sopenharmony_ci while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && 103562306a36Sopenharmony_ci lo < hi) { 103662306a36Sopenharmony_ci lo++; 103762306a36Sopenharmony_ci error = xfarray_sort_load_cached(si, lo, 103862306a36Sopenharmony_ci scratch); 103962306a36Sopenharmony_ci if (error) 104062306a36Sopenharmony_ci goto out_free; 104162306a36Sopenharmony_ci } 104262306a36Sopenharmony_ci error = xfarray_sort_put_page(si); 104362306a36Sopenharmony_ci if (error) 104462306a36Sopenharmony_ci goto out_free; 104562306a36Sopenharmony_ci 104662306a36Sopenharmony_ci if (xfarray_sort_terminated(si, &error)) 104762306a36Sopenharmony_ci goto out_free; 104862306a36Sopenharmony_ci 104962306a36Sopenharmony_ci /* Copy that item (a[lo]) to a[hi]. */ 105062306a36Sopenharmony_ci if (lo < hi) { 105162306a36Sopenharmony_ci error = xfarray_sort_store(si, hi--, scratch); 105262306a36Sopenharmony_ci if (error) 105362306a36Sopenharmony_ci goto out_free; 105462306a36Sopenharmony_ci } 105562306a36Sopenharmony_ci 105662306a36Sopenharmony_ci if (xfarray_sort_terminated(si, &error)) 105762306a36Sopenharmony_ci goto out_free; 105862306a36Sopenharmony_ci } 105962306a36Sopenharmony_ci 106062306a36Sopenharmony_ci /* 106162306a36Sopenharmony_ci * Put our pivot value in the correct place at a[lo]. All 106262306a36Sopenharmony_ci * values between a[beg[i]] and a[lo - 1] should be less than 106362306a36Sopenharmony_ci * the pivot; and all values between a[lo + 1] and a[end[i]-1] 106462306a36Sopenharmony_ci * should be greater than the pivot. 106562306a36Sopenharmony_ci */ 106662306a36Sopenharmony_ci error = xfarray_sort_store(si, lo, pivot); 106762306a36Sopenharmony_ci if (error) 106862306a36Sopenharmony_ci goto out_free; 106962306a36Sopenharmony_ci 107062306a36Sopenharmony_ci /* Set up the stack frame to process the two partitions. */ 107162306a36Sopenharmony_ci error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi); 107262306a36Sopenharmony_ci if (error) 107362306a36Sopenharmony_ci goto out_free; 107462306a36Sopenharmony_ci 107562306a36Sopenharmony_ci if (xfarray_sort_terminated(si, &error)) 107662306a36Sopenharmony_ci goto out_free; 107762306a36Sopenharmony_ci } 107862306a36Sopenharmony_ci 107962306a36Sopenharmony_ciout_free: 108062306a36Sopenharmony_ci trace_xfarray_sort_stats(si, error); 108162306a36Sopenharmony_ci kvfree(si); 108262306a36Sopenharmony_ci return error; 108362306a36Sopenharmony_ci} 1084