// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
	ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, thus we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
					     struct page *page)
{
	if (page_has_private(page)) {
		if (!trylock_page(page))
			BUG();
		page->mapping = mapping;
		do_invalidatepage(page, 0, PAGE_SIZE);
		page->mapping = NULL;
		unlock_page(page);
	}
	put_page(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
					      struct list_head *pages)
{
	struct page *victim;

	while (!list_empty(pages)) {
		victim = lru_to_page(pages);
		list_del(&victim->lru);
		read_cache_pages_invalidate_page(mapping, victim);
	}
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Returns: %0 on success, or the error returned by @filler otherwise
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
		     int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = lru_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping, page->index,
				readahead_gfp_mask(mapping))) {
			read_cache_pages_invalidate_page(mapping, page);
			continue;
		}
		put_page(page);

		ret = filler(data, page);
		if (unlikely(ret)) {
			read_cache_pages_invalidate_pages(mapping, pages);
			break;
		}
		task_io_account_read(PAGE_SIZE);
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);

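/*
 * Submit the pages accumulated in @rac (or, for the legacy ->readpages()
 * path, on the @pages list) to the filesystem, preferring ->readahead()
 * over ->readpages() over ->readpage().  Any pages the filesystem did not
 * start I/O on are cleaned up here.
 */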
static void read_pages(struct readahead_control *rac, struct list_head *pages,
		bool skip_page)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct page *page;
	struct blk_plug plug;

	if (!readahead_count(rac))
		goto out;

	blk_start_plug(&plug);

	if (aops->readahead) {
		aops->readahead(rac);
		/* Clean up the remaining pages */
		while ((page = readahead_page(rac))) {
			unlock_page(page);
			put_page(page);
		}
	} else if (aops->readpages) {
		aops->readpages(rac->file, rac->mapping, pages,
				readahead_count(rac));
		/* Clean up the remaining pages */
		put_pages_list(pages);
		rac->_index += rac->_nr_pages;
		rac->_nr_pages = 0;
	} else {
		while ((page = readahead_page(rac))) {
			aops->readpage(rac->file, page);
			put_page(page);
		}
	}

	blk_finish_plug(&plug);

	BUG_ON(!list_empty(pages));
	BUG_ON(readahead_count(rac));

out:
	if (skip_page)
		rac->_index++;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct address_space *mapping = ractl->mapping;
	unsigned long index = readahead_index(ractl);
	LIST_HEAD(page_pool);
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		BUG_ON(index + i != ractl->_index + ractl->_nr_pages);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(ractl, &page_pool, true);
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(ractl, &page_pool, true);
			continue;
		}
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ractl->_nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(ractl, &page_pool, false);
	memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O.  This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long nr_to_read)
{
	struct address_space *mapping = ractl->mapping;
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages, index;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
			!mapping->a_ops->readahead))
		return;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	index = readahead_index(ractl);
	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
	nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
	while (nr_to_read) {
		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		ractl->_index = index;
		do_page_cache_ra(ractl, this_chunk, 0);

		index += this_chunk;
		nr_to_read -= this_chunk;
	}
}

/*
 * Set the initial window size: round the request up to the next power of
 * two, then scale it up - x 4 for small requests (up to max/32), x 2 for
 * medium ones (up to max/4) - and clamp the result to max.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}

/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application has consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window.  Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * invalidate each other's readahead state.  So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator.  The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads.  Note that the readahead algorithm checks loosely
 * for sequential patterns.  Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

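/*
 * Illustrative ramp-up (assuming ra_pages == 32 and a cold cache): a
 * sequential reader asking for 4 pages at index 0 gets an initial window
 * of 8 pages with async_size 4, so page 4 is marked PG_readahead.  When
 * the reader reaches page 4, the window moves to start 8 with size 16;
 * hitting the marker at page 8 grows it to 32 pages at start 24, after
 * which the window stays at the 32-page maximum.
 */
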
/*
 * Count contiguously cached pages from @index-1 to @index-@max,
 * this count is a conservative estimate of
 * 	- length of the sequential read sequence, or
 * 	- thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
				   pgoff_t index, unsigned long max)
{
	pgoff_t head;

	rcu_read_lock();
	head = page_cache_prev_miss(mapping, index - 1, max);
	rcu_read_unlock();

	return index - 1 - head;
}

/*
 * page cache context based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
				 struct file_ra_state *ra,
				 pgoff_t index,
				 unsigned long req_size,
				 unsigned long max)
{
	pgoff_t size;

	size = count_history_pages(mapping, index, max);

	/*
	 * not enough history pages:
	 * it could be a random read
	 */
	if (size <= req_size)
		return 0;

	/*
	 * starts from beginning of file:
	 * it is a strong indication of a long-run stream (or whole-file read)
	 */
	if (size >= index)
		size *= 2;

	ra->start = index;
	ra->size = min(size + req_size, max);
	ra->async_size = 1;

	return 1;
}

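/*
 * Example (illustrative): a 4-page read at index 100 with pages 92-99
 * already cached gives count_history_pages() == 8 > 4, so the stream is
 * treated as sequential and the next window becomes min(8 + 4, max) pages
 * starting at index 100, with async_size 1.
 */
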
/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
		struct file_ra_state *ra, bool hit_readahead_marker,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	unsigned long index = readahead_index(ractl);
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals the
	 * readahead size.  Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(ractl->mapping, index + 1,
				max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces (cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(ractl->mapping, ra, index, req_size,
			max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	do_page_cache_ra(ractl, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulting next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ractl->_index = ra->start;
	do_page_cache_ra(ractl, ra->size, ra->async_size);
}

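/*
 * Synchronous readahead entry point, called for a page cache miss.  Uses
 * forced readahead for FMODE_RANDOM files, and (limited to a single page)
 * when readahead is disabled or the blk cgroup is congested; otherwise it
 * hands off to ondemand_readahead().
 */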
void page_cache_sync_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long req_count)
{
	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

	/*
	 * Even if read-ahead is disabled, issue this request as read-ahead
	 * as we'll need it to satisfy the requested range.  The forced
	 * read-ahead will do the right thing and limit the read to just the
	 * requested range, which we'll set to 1 page for this case.
	 */
	if (!ra->ra_pages || blk_cgroup_congested()) {
		if (!ractl->file)
			return;
		req_count = 1;
		do_forced_ra = true;
	}

	/* be dumb */
	if (do_forced_ra) {
		force_page_cache_ra(ractl, ra, req_count);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(ractl, ra, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

void page_cache_async_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, struct page *page,
		unsigned long req_count)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (inode_read_congested(ractl->mapping->host))
		return;

	if (blk_cgroup_congested())
		return;

	/* do read-ahead */
	ondemand_readahead(ractl, ra, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

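/*
 * readahead(2) syscall entry point: validate that the fd is readable and
 * backed by a mapping that can do readahead, then delegate the work to
 * vfs_fadvise(POSIX_FADV_WILLNEED).
 */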
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct fd f;

	ret = -EBADF;
	f = fdget(fd);
	if (!f.file || !(f.file->f_mode & FMODE_READ))
		goto out;

	/*
	 * The readahead() syscall is intended to run only on files
	 * that can execute readahead.  If readahead is not possible
	 * on this file, then we must return -EINVAL.
	 */
	ret = -EINVAL;
	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
	    (!S_ISREG(file_inode(f.file)->i_mode) &&
	    !S_ISBLK(file_inode(f.file)->i_mode)))
		goto out;

	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
	fdput(f);
	return ret;
}

SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
	return ksys_readahead(fd, offset, count);
}