// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
	ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, thus we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
					     struct page *page)
{
	if (page_has_private(page)) {
		if (!trylock_page(page))
			BUG();
		page->mapping = mapping;
		do_invalidatepage(page, 0, PAGE_SIZE);
		page->mapping = NULL;
		unlock_page(page);
	}
	put_page(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
					      struct list_head *pages)
{
	struct page *victim;

	while (!list_empty(pages)) {
		victim = lru_to_page(pages);
		list_del(&victim->lru);
		read_cache_pages_invalidate_page(mapping, victim);
	}
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Returns: %0 on success, error return by @filler otherwise
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = lru_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping, page->index,
				readahead_gfp_mask(mapping))) {
			read_cache_pages_invalidate_page(mapping, page);
			continue;
		}
		put_page(page);

		ret = filler(data, page);
		if (unlikely(ret)) {
			read_cache_pages_invalidate_pages(mapping, pages);
			break;
		}
		task_io_account_read(PAGE_SIZE);
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);
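
/*
 * Illustrative sketch (not part of this file; the myfs_* names are
 * hypothetical): a filesystem-side helper could drive read_cache_pages()
 * with a per-page filler that behaves like ->readpage(), i.e. starts I/O
 * on the locked page and unlocks it on completion:
 *
 *	static int myfs_fill_page(void *data, struct page *page)
 *	{
 *		struct file *file = data;
 *
 *		return myfs_readpage(file, page);
 *	}
 *
 *	static int myfs_read_list(struct file *file,
 *				  struct address_space *mapping,
 *				  struct list_head *pages)
 *	{
 *		return read_cache_pages(mapping, pages, myfs_fill_page, file);
 *	}
 */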

static void read_pages(struct readahead_control *rac, struct list_head *pages,
		bool skip_page)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct page *page;
	struct blk_plug plug;

	if (!readahead_count(rac))
		goto out;

	blk_start_plug(&plug);

	if (aops->readahead) {
		aops->readahead(rac);
		/* Clean up the remaining pages */
		while ((page = readahead_page(rac))) {
			unlock_page(page);
			put_page(page);
		}
	} else if (aops->readpages) {
		aops->readpages(rac->file, rac->mapping, pages,
				readahead_count(rac));
		/* Clean up the remaining pages */
		put_pages_list(pages);
		rac->_index += rac->_nr_pages;
		rac->_nr_pages = 0;
	} else {
		while ((page = readahead_page(rac))) {
			aops->readpage(rac->file, page);
			put_page(page);
		}
	}

	blk_finish_plug(&plug);

	BUG_ON(!list_empty(pages));
	BUG_ON(readahead_count(rac));

out:
	if (skip_page)
		rac->_index++;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct address_space *mapping = ractl->mapping;
	unsigned long index = readahead_index(ractl);
	LIST_HEAD(page_pool);
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		BUG_ON(index + i != ractl->_index + ractl->_nr_pages);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(ractl, &page_pool, true);
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(ractl, &page_pool, true);
			continue;
		}
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ractl->_nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(ractl, &page_pool, false);
	memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
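
/*
 * Illustrative sketch (hypothetical helper, not part of this file): a
 * filesystem that keeps private data in the page cache beyond i_size could
 * populate a range of those pages like this, assuming the DEFINE_READAHEAD()
 * initialiser from <linux/pagemap.h> in this kernel:
 *
 *	static void myfs_ra_metadata(struct file *file, pgoff_t index,
 *				     unsigned long nr_pages)
 *	{
 *		DEFINE_READAHEAD(ractl, file, file->f_mapping, index);
 *
 *		// Deliberately skips the i_size clamp that
 *		// do_page_cache_ra() would apply.
 *		page_cache_ra_unbounded(&ractl, nr_pages, 0);
 *	}
 */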

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
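
/*
 * Clamping example for do_page_cache_ra(): with 4K pages and
 * i_size = 10000 bytes, end_index is 2; an 8-page request starting at
 * index 1 is trimmed to 2 pages, so the page containing the last byte
 * (page 2) is still read, but nothing beyond it.
 */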

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long nr_to_read)
{
	struct address_space *mapping = ractl->mapping;
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages, index;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
			!mapping->a_ops->readahead))
		return;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	index = readahead_index(ractl);
	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
	nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
	while (nr_to_read) {
		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		ractl->_index = index;
		do_page_cache_ra(ractl, this_chunk, 0);

		index += this_chunk;
		nr_to_read -= this_chunk;
	}
}
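
/*
 * For example, with 4K pages each chunk above is 2MB/4K = 512 pages, so a
 * (hypothetical) 1200-page forced readahead, assuming max_pages allows it,
 * is issued as three calls to do_page_cache_ra(): 512 + 512 + 176 pages.
 */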

/*
 * Set the initial window size: round the request up to the next power of
 * two, then scale it up (x 4 for small requests, x 2 for medium ones),
 * capping the result at the maximum readahead size.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}
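
/*
 * Worked example for get_init_ra_size() with max = 32 pages (128k):
 *
 *	requested pages	rounded up	initial window
 *	1		1  (<= max/32)	 4 pages ( 16k)
 *	4		4  (<= max/4)	 8 pages ( 32k)
 *	6		8  (<= max/4)	16 pages ( 64k)
 *	12		16 (>  max/4)	32 pages (128k)
 */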

/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}
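
/*
 * For example, with max = 32 pages a 1-page window ramps up across
 * successive sequential windows as 1 -> 4 -> 8 -> 16 -> 32 and then
 * stays pinned at max.
 */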

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application has consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

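/*
 * Worked example (window sizes follow get_init_ra_size()/get_next_ra_size()
 * and are shown only to illustrate the pipelining): a sequential reader
 * issuing 16k (4 page) reads with a 32 page max readahead window:
 *
 *	read at index 0      -> start=0,  size=8,  async_size=4;
 *	                        pages 0-7 submitted, page 4 marked PG_readahead
 *	reader reaches page 4 -> start=8,  size=16, async_size=16;
 *	                        pages 8-23 submitted, page 8 marked
 *	reader reaches page 8 -> start=24, size=32, async_size=32;
 *	                        pages 24-55 submitted, page 24 marked
 *
 * and from then on the 32-page window keeps moving forward on each
 * marker hit.
 */
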
/*
 * Count contiguously cached pages from @index-1 to @index-@max,
 * this count is a conservative estimation of
 *	- length of the sequential read sequence, or
 *	- thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
				   pgoff_t index, unsigned long max)
{
	pgoff_t head;

	rcu_read_lock();
	head = page_cache_prev_miss(mapping, index - 1, max);
	rcu_read_unlock();

	return index - 1 - head;
}

/*
 * page cache context based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
				 struct file_ra_state *ra,
				 pgoff_t index,
				 unsigned long req_size,
				 unsigned long max)
{
	pgoff_t size;

	size = count_history_pages(mapping, index, max);

	/*
	 * not enough history pages:
	 * it could be a random read
	 */
	if (size <= req_size)
		return 0;

	/*
	 * starts from beginning of file:
	 * it is a strong indication of long-run stream (or whole-file-read)
	 */
	if (size >= index)
		size *= 2;

	ra->start = index;
	ra->size = min(size + req_size, max);
	ra->async_size = 1;

	return 1;
}
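
/*
 * For example, with max = 32: if pages 10-29 are already cached and a
 * 4-page read arrives at index 30, count_history_pages() reports 20
 * history pages.  That exceeds the request size, so readahead resumes
 * with start = 30, size = min(20 + 4, 32) = 24 and async_size = 1.
 */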

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
		struct file_ra_state *ra, bool hit_readahead_marker,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	unsigned long index = readahead_index(ractl);
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which is normally equal to
	 * the readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(ractl->mapping, index + 1,
				max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces (cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(ractl->mapping, ra, index, req_size,
			max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	do_page_cache_ra(ractl, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulting next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ractl->_index = ra->start;
	do_page_cache_ra(ractl, ra->size, ra->async_size);
}
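
/*
 * Example for the marker-merge case at readit above: an initial 32-page
 * read with max_pages == 32 produces size = async_size = 32 at the faulting
 * index; get_next_ra_size() proposes 32 more pages, which would exceed
 * max_pages, so the window is clamped to size = 32 with async_size = 16
 * and the PG_readahead marker lands halfway through the window.
 */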

void page_cache_sync_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long req_count)
{
	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

	/*
	 * Even if read-ahead is disabled, issue this request as read-ahead
	 * as we'll need it to satisfy the requested range. The forced
	 * read-ahead will do the right thing and limit the read to just the
	 * requested range, which we'll set to 1 page for this case.
	 */
	if (!ra->ra_pages || blk_cgroup_congested()) {
		if (!ractl->file)
			return;
		req_count = 1;
		do_forced_ra = true;
	}

	/* be dumb */
	if (do_forced_ra) {
		force_page_cache_ra(ractl, ra, req_count);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(ractl, ra, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
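
/*
 * Illustrative sketch: callers normally reach this function via the
 * page_cache_sync_readahead() wrapper in <linux/pagemap.h>, which builds
 * the readahead_control on the stack.  A cache-miss read path would look
 * roughly like this (variable names are illustrative, and the
 * DEFINE_READAHEAD() form is assumed to match this kernel's header):
 *
 *	DEFINE_READAHEAD(ractl, file, mapping, index);
 *	page_cache_sync_ra(&ractl, &file->f_ra, last_index - index);
 */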

void page_cache_async_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, struct page *page,
		unsigned long req_count)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (inode_read_congested(ractl->mapping->host))
		return;

	if (blk_cgroup_congested())
		return;

	/* do read-ahead */
	ondemand_readahead(ractl, ra, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct fd f;

	ret = -EBADF;
	f = fdget(fd);
	if (!f.file || !(f.file->f_mode & FMODE_READ))
		goto out;

	/*
	 * The readahead() syscall is intended to run only on files
	 * that can execute readahead. If readahead is not possible
	 * on this file, then we must return -EINVAL.
	 */
	ret = -EINVAL;
	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
	    (!S_ISREG(file_inode(f.file)->i_mode) &&
	    !S_ISBLK(file_inode(f.file)->i_mode)))
		goto out;

	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
	fdput(f);
	return ret;
}

SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
	return ksys_readahead(fd, offset, count);
}
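
/*
 * Userspace view of the syscall above (illustrative, not kernel code):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int fd = open("data.bin", O_RDONLY);
 *
 *	// Hint the kernel to populate the page cache with the first 1MB;
 *	// returns 0 on success, -1 with errno (e.g. EBADF, EINVAL) on error.
 *	if (readahead(fd, 0, 1 << 20) != 0)
 *		perror("readahead");
 */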