18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci *	linux/mm/filemap.c
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright (C) 1994-1999  Linus Torvalds
68c2ecf20Sopenharmony_ci */
78c2ecf20Sopenharmony_ci
88c2ecf20Sopenharmony_ci/*
98c2ecf20Sopenharmony_ci * This file handles the generic file mmap semantics used by
108c2ecf20Sopenharmony_ci * most "normal" filesystems (but you don't /have/ to use this:
118c2ecf20Sopenharmony_ci * the NFS filesystem used to do this differently, for example)
128c2ecf20Sopenharmony_ci */
138c2ecf20Sopenharmony_ci#include <linux/export.h>
148c2ecf20Sopenharmony_ci#include <linux/compiler.h>
158c2ecf20Sopenharmony_ci#include <linux/dax.h>
168c2ecf20Sopenharmony_ci#include <linux/fs.h>
178c2ecf20Sopenharmony_ci#include <linux/sched/signal.h>
188c2ecf20Sopenharmony_ci#include <linux/uaccess.h>
198c2ecf20Sopenharmony_ci#include <linux/capability.h>
208c2ecf20Sopenharmony_ci#include <linux/kernel_stat.h>
218c2ecf20Sopenharmony_ci#include <linux/gfp.h>
228c2ecf20Sopenharmony_ci#include <linux/mm.h>
238c2ecf20Sopenharmony_ci#include <linux/swap.h>
248c2ecf20Sopenharmony_ci#include <linux/mman.h>
258c2ecf20Sopenharmony_ci#include <linux/pagemap.h>
268c2ecf20Sopenharmony_ci#include <linux/file.h>
278c2ecf20Sopenharmony_ci#include <linux/uio.h>
288c2ecf20Sopenharmony_ci#include <linux/error-injection.h>
298c2ecf20Sopenharmony_ci#include <linux/hash.h>
308c2ecf20Sopenharmony_ci#include <linux/writeback.h>
318c2ecf20Sopenharmony_ci#include <linux/backing-dev.h>
328c2ecf20Sopenharmony_ci#include <linux/pagevec.h>
338c2ecf20Sopenharmony_ci#include <linux/blkdev.h>
348c2ecf20Sopenharmony_ci#include <linux/security.h>
358c2ecf20Sopenharmony_ci#include <linux/cpuset.h>
368c2ecf20Sopenharmony_ci#include <linux/hugetlb.h>
378c2ecf20Sopenharmony_ci#include <linux/memcontrol.h>
388c2ecf20Sopenharmony_ci#include <linux/cleancache.h>
398c2ecf20Sopenharmony_ci#include <linux/shmem_fs.h>
408c2ecf20Sopenharmony_ci#include <linux/rmap.h>
418c2ecf20Sopenharmony_ci#include <linux/delayacct.h>
428c2ecf20Sopenharmony_ci#include <linux/psi.h>
438c2ecf20Sopenharmony_ci#include <linux/ramfs.h>
448c2ecf20Sopenharmony_ci#include <linux/page_idle.h>
458c2ecf20Sopenharmony_ci#include "internal.h"
468c2ecf20Sopenharmony_ci
478c2ecf20Sopenharmony_ci#define CREATE_TRACE_POINTS
488c2ecf20Sopenharmony_ci#include <trace/events/filemap.h>
498c2ecf20Sopenharmony_ci
508c2ecf20Sopenharmony_ci/*
518c2ecf20Sopenharmony_ci * FIXME: remove all knowledge of the buffer layer from the core VM
528c2ecf20Sopenharmony_ci */
538c2ecf20Sopenharmony_ci#include <linux/buffer_head.h> /* for try_to_free_buffers */
548c2ecf20Sopenharmony_ci
558c2ecf20Sopenharmony_ci#include <asm/mman.h>
568c2ecf20Sopenharmony_ci
578c2ecf20Sopenharmony_ci/*
588c2ecf20Sopenharmony_ci * Shared mappings implemented 30.11.1994. It's not fully working yet,
598c2ecf20Sopenharmony_ci * though.
608c2ecf20Sopenharmony_ci *
618c2ecf20Sopenharmony_ci * Shared mappings now work. 15.8.1995  Bruno.
628c2ecf20Sopenharmony_ci *
638c2ecf20Sopenharmony_ci * finished 'unifying' the page and buffer cache and SMP-threaded the
648c2ecf20Sopenharmony_ci * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
658c2ecf20Sopenharmony_ci *
668c2ecf20Sopenharmony_ci * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
678c2ecf20Sopenharmony_ci */
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_ci/*
708c2ecf20Sopenharmony_ci * Lock ordering:
718c2ecf20Sopenharmony_ci *
728c2ecf20Sopenharmony_ci *  ->i_mmap_rwsem		(truncate_pagecache)
738c2ecf20Sopenharmony_ci *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
748c2ecf20Sopenharmony_ci *      ->swap_lock		(exclusive_swap_page, others)
758c2ecf20Sopenharmony_ci *        ->i_pages lock
768c2ecf20Sopenharmony_ci *
778c2ecf20Sopenharmony_ci *  ->i_mutex
788c2ecf20Sopenharmony_ci *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
798c2ecf20Sopenharmony_ci *
808c2ecf20Sopenharmony_ci *  ->mmap_lock
818c2ecf20Sopenharmony_ci *    ->i_mmap_rwsem
828c2ecf20Sopenharmony_ci *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
838c2ecf20Sopenharmony_ci *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
848c2ecf20Sopenharmony_ci *
858c2ecf20Sopenharmony_ci *  ->mmap_lock
868c2ecf20Sopenharmony_ci *    ->lock_page		(access_process_vm)
878c2ecf20Sopenharmony_ci *
888c2ecf20Sopenharmony_ci *  ->i_mutex			(generic_perform_write)
898c2ecf20Sopenharmony_ci *    ->mmap_lock		(fault_in_pages_readable->do_page_fault)
908c2ecf20Sopenharmony_ci *
918c2ecf20Sopenharmony_ci *  bdi->wb.list_lock
928c2ecf20Sopenharmony_ci *    sb_lock			(fs/fs-writeback.c)
938c2ecf20Sopenharmony_ci *    ->i_pages lock		(__sync_single_inode)
948c2ecf20Sopenharmony_ci *
958c2ecf20Sopenharmony_ci *  ->i_mmap_rwsem
968c2ecf20Sopenharmony_ci *    ->anon_vma.lock		(vma_adjust)
978c2ecf20Sopenharmony_ci *
988c2ecf20Sopenharmony_ci *  ->anon_vma.lock
998c2ecf20Sopenharmony_ci *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
1008c2ecf20Sopenharmony_ci *
1018c2ecf20Sopenharmony_ci *  ->page_table_lock or pte_lock
1028c2ecf20Sopenharmony_ci *    ->swap_lock		(try_to_unmap_one)
1038c2ecf20Sopenharmony_ci *    ->private_lock		(try_to_unmap_one)
1048c2ecf20Sopenharmony_ci *    ->i_pages lock		(try_to_unmap_one)
1058c2ecf20Sopenharmony_ci *    ->pgdat->lru_lock		(follow_page->mark_page_accessed)
1068c2ecf20Sopenharmony_ci *    ->pgdat->lru_lock		(check_pte_range->isolate_lru_page)
1078c2ecf20Sopenharmony_ci *    ->private_lock		(page_remove_rmap->set_page_dirty)
1088c2ecf20Sopenharmony_ci *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
1098c2ecf20Sopenharmony_ci *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
1108c2ecf20Sopenharmony_ci *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
1118c2ecf20Sopenharmony_ci *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
1128c2ecf20Sopenharmony_ci *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
1138c2ecf20Sopenharmony_ci *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
1148c2ecf20Sopenharmony_ci *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
1158c2ecf20Sopenharmony_ci *
1168c2ecf20Sopenharmony_ci * ->i_mmap_rwsem
1178c2ecf20Sopenharmony_ci *   ->tasklist_lock            (memory_failure, collect_procs_ao)
1188c2ecf20Sopenharmony_ci */
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_cistatic void page_cache_delete(struct address_space *mapping,
1218c2ecf20Sopenharmony_ci				   struct page *page, void *shadow)
1228c2ecf20Sopenharmony_ci{
1238c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, page->index);
1248c2ecf20Sopenharmony_ci	unsigned int nr = 1;
1258c2ecf20Sopenharmony_ci
1268c2ecf20Sopenharmony_ci	mapping_set_update(&xas, mapping);
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_ci	/* hugetlb pages are represented by a single entry in the xarray */
1298c2ecf20Sopenharmony_ci	if (!PageHuge(page)) {
1308c2ecf20Sopenharmony_ci		xas_set_order(&xas, page->index, compound_order(page));
1318c2ecf20Sopenharmony_ci		nr = compound_nr(page);
1328c2ecf20Sopenharmony_ci	}
1338c2ecf20Sopenharmony_ci
1348c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(!PageLocked(page), page);
1358c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageTail(page), page);
1368c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(nr != 1 && shadow, page);
1378c2ecf20Sopenharmony_ci
1388c2ecf20Sopenharmony_ci	xas_store(&xas, shadow);
1398c2ecf20Sopenharmony_ci	xas_init_marks(&xas);
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ci	page->mapping = NULL;
1428c2ecf20Sopenharmony_ci	/* Leave page->index set: truncation lookup relies upon it */
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci	if (shadow) {
1458c2ecf20Sopenharmony_ci		mapping->nrexceptional += nr;
1468c2ecf20Sopenharmony_ci		/*
1478c2ecf20Sopenharmony_ci		 * Make sure the nrexceptional update is committed before
1488c2ecf20Sopenharmony_ci		 * the nrpages update so that final truncate racing
1498c2ecf20Sopenharmony_ci		 * with reclaim does not see both counters 0 at the
1508c2ecf20Sopenharmony_ci		 * same time and miss a shadow entry.
1518c2ecf20Sopenharmony_ci		 */
1528c2ecf20Sopenharmony_ci		smp_wmb();
1538c2ecf20Sopenharmony_ci	}
1548c2ecf20Sopenharmony_ci	mapping->nrpages -= nr;
1558c2ecf20Sopenharmony_ci}
1568c2ecf20Sopenharmony_ci
1578c2ecf20Sopenharmony_cistatic void unaccount_page_cache_page(struct address_space *mapping,
1588c2ecf20Sopenharmony_ci				      struct page *page)
1598c2ecf20Sopenharmony_ci{
1608c2ecf20Sopenharmony_ci	int nr;
1618c2ecf20Sopenharmony_ci
1628c2ecf20Sopenharmony_ci	/*
1638c2ecf20Sopenharmony_ci	 * if we're uptodate, flush out into the cleancache, otherwise
1648c2ecf20Sopenharmony_ci	 * invalidate any existing cleancache entries.  We can't leave
1658c2ecf20Sopenharmony_ci	 * stale data around in the cleancache once our page is gone
1668c2ecf20Sopenharmony_ci	 */
1678c2ecf20Sopenharmony_ci	if (PageUptodate(page) && PageMappedToDisk(page))
1688c2ecf20Sopenharmony_ci		cleancache_put_page(page);
1698c2ecf20Sopenharmony_ci	else
1708c2ecf20Sopenharmony_ci		cleancache_invalidate_page(mapping, page);
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageTail(page), page);
1738c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(page_mapped(page), page);
1748c2ecf20Sopenharmony_ci	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
1758c2ecf20Sopenharmony_ci		int mapcount;
1768c2ecf20Sopenharmony_ci
1778c2ecf20Sopenharmony_ci		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
1788c2ecf20Sopenharmony_ci			 current->comm, page_to_pfn(page));
1798c2ecf20Sopenharmony_ci		dump_page(page, "still mapped when deleted");
1808c2ecf20Sopenharmony_ci		dump_stack();
1818c2ecf20Sopenharmony_ci		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci		mapcount = page_mapcount(page);
1848c2ecf20Sopenharmony_ci		if (mapping_exiting(mapping) &&
1858c2ecf20Sopenharmony_ci		    page_count(page) >= mapcount + 2) {
1868c2ecf20Sopenharmony_ci			/*
1878c2ecf20Sopenharmony_ci			 * All vmas have already been torn down, so it's
1888c2ecf20Sopenharmony_ci			 * a good bet that actually the page is unmapped,
1898c2ecf20Sopenharmony_ci			 * and we'd prefer not to leak it: if we're wrong,
1908c2ecf20Sopenharmony_ci			 * some other bad page check should catch it later.
1918c2ecf20Sopenharmony_ci			 */
1928c2ecf20Sopenharmony_ci			page_mapcount_reset(page);
1938c2ecf20Sopenharmony_ci			page_ref_sub(page, mapcount);
1948c2ecf20Sopenharmony_ci		}
1958c2ecf20Sopenharmony_ci	}
1968c2ecf20Sopenharmony_ci
1978c2ecf20Sopenharmony_ci	/* hugetlb pages do not participate in page cache accounting. */
1988c2ecf20Sopenharmony_ci	if (PageHuge(page))
1998c2ecf20Sopenharmony_ci		return;
2008c2ecf20Sopenharmony_ci
2018c2ecf20Sopenharmony_ci	nr = thp_nr_pages(page);
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
2048c2ecf20Sopenharmony_ci	if (PageSwapBacked(page)) {
2058c2ecf20Sopenharmony_ci		__mod_lruvec_page_state(page, NR_SHMEM, -nr);
2068c2ecf20Sopenharmony_ci		if (PageTransHuge(page))
2078c2ecf20Sopenharmony_ci			__dec_node_page_state(page, NR_SHMEM_THPS);
2088c2ecf20Sopenharmony_ci	} else if (PageTransHuge(page)) {
2098c2ecf20Sopenharmony_ci		__dec_node_page_state(page, NR_FILE_THPS);
2108c2ecf20Sopenharmony_ci		filemap_nr_thps_dec(mapping);
2118c2ecf20Sopenharmony_ci	}
2128c2ecf20Sopenharmony_ci
2138c2ecf20Sopenharmony_ci	/*
2148c2ecf20Sopenharmony_ci	 * At this point page must be either written or cleaned by
2158c2ecf20Sopenharmony_ci	 * truncate.  Dirty page here signals a bug and loss of
2168c2ecf20Sopenharmony_ci	 * unwritten data.
2178c2ecf20Sopenharmony_ci	 *
2188c2ecf20Sopenharmony_ci	 * This fixes dirty accounting after removing the page entirely
2198c2ecf20Sopenharmony_ci	 * but leaves PageDirty set: it has no effect for truncated
2208c2ecf20Sopenharmony_ci	 * page and anyway will be cleared before returning page into
2218c2ecf20Sopenharmony_ci	 * buddy allocator.
2228c2ecf20Sopenharmony_ci	 */
2238c2ecf20Sopenharmony_ci	if (WARN_ON_ONCE(PageDirty(page)))
2248c2ecf20Sopenharmony_ci		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
2258c2ecf20Sopenharmony_ci}
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_ci/*
2288c2ecf20Sopenharmony_ci * Delete a page from the page cache and free it. Caller has to make
2298c2ecf20Sopenharmony_ci * sure the page is locked and that nobody else uses it - or that usage
2308c2ecf20Sopenharmony_ci * is safe.  The caller must hold the i_pages lock.
2318c2ecf20Sopenharmony_ci */
2328c2ecf20Sopenharmony_civoid __delete_from_page_cache(struct page *page, void *shadow)
2338c2ecf20Sopenharmony_ci{
2348c2ecf20Sopenharmony_ci	struct address_space *mapping = page->mapping;
2358c2ecf20Sopenharmony_ci
2368c2ecf20Sopenharmony_ci	trace_mm_filemap_delete_from_page_cache(page);
2378c2ecf20Sopenharmony_ci
2388c2ecf20Sopenharmony_ci	unaccount_page_cache_page(mapping, page);
2398c2ecf20Sopenharmony_ci	page_cache_delete(mapping, page, shadow);
2408c2ecf20Sopenharmony_ci}
2418c2ecf20Sopenharmony_ci
2428c2ecf20Sopenharmony_cistatic void page_cache_free_page(struct address_space *mapping,
2438c2ecf20Sopenharmony_ci				struct page *page)
2448c2ecf20Sopenharmony_ci{
2458c2ecf20Sopenharmony_ci	void (*freepage)(struct page *);
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	freepage = mapping->a_ops->freepage;
2488c2ecf20Sopenharmony_ci	if (freepage)
2498c2ecf20Sopenharmony_ci		freepage(page);
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci	if (PageTransHuge(page) && !PageHuge(page)) {
2528c2ecf20Sopenharmony_ci		page_ref_sub(page, thp_nr_pages(page));
2538c2ecf20Sopenharmony_ci		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
2548c2ecf20Sopenharmony_ci	} else {
2558c2ecf20Sopenharmony_ci		put_page(page);
2568c2ecf20Sopenharmony_ci	}
2578c2ecf20Sopenharmony_ci}
2588c2ecf20Sopenharmony_ci
2598c2ecf20Sopenharmony_ci/**
2608c2ecf20Sopenharmony_ci * delete_from_page_cache - delete page from page cache
2618c2ecf20Sopenharmony_ci * @page: the page which the kernel is trying to remove from page cache
2628c2ecf20Sopenharmony_ci *
2638c2ecf20Sopenharmony_ci * This must be called only on pages that have been verified to be in the page
2648c2ecf20Sopenharmony_ci * cache and locked.  It will never put the page into the free list, the caller
2658c2ecf20Sopenharmony_ci * has a reference on the page.
2668c2ecf20Sopenharmony_ci */
2678c2ecf20Sopenharmony_civoid delete_from_page_cache(struct page *page)
2688c2ecf20Sopenharmony_ci{
2698c2ecf20Sopenharmony_ci	struct address_space *mapping = page_mapping(page);
2708c2ecf20Sopenharmony_ci	unsigned long flags;
2718c2ecf20Sopenharmony_ci
2728c2ecf20Sopenharmony_ci	BUG_ON(!PageLocked(page));
2738c2ecf20Sopenharmony_ci	xa_lock_irqsave(&mapping->i_pages, flags);
2748c2ecf20Sopenharmony_ci	__delete_from_page_cache(page, NULL);
2758c2ecf20Sopenharmony_ci	xa_unlock_irqrestore(&mapping->i_pages, flags);
2768c2ecf20Sopenharmony_ci
2778c2ecf20Sopenharmony_ci	page_cache_free_page(mapping, page);
2788c2ecf20Sopenharmony_ci}
2798c2ecf20Sopenharmony_ciEXPORT_SYMBOL(delete_from_page_cache);
2808c2ecf20Sopenharmony_ci
2818c2ecf20Sopenharmony_ci/*
2828c2ecf20Sopenharmony_ci * page_cache_delete_batch - delete several pages from page cache
2838c2ecf20Sopenharmony_ci * @mapping: the mapping to which pages belong
2848c2ecf20Sopenharmony_ci * @pvec: pagevec with pages to delete
2858c2ecf20Sopenharmony_ci *
2868c2ecf20Sopenharmony_ci * The function walks over mapping->i_pages and removes pages passed in @pvec
2878c2ecf20Sopenharmony_ci * from the mapping. The function expects @pvec to be sorted by page index
2888c2ecf20Sopenharmony_ci * and is optimised for it to be dense.
2898c2ecf20Sopenharmony_ci * It tolerates holes in @pvec (mapping entries at those indices are not
2908c2ecf20Sopenharmony_ci * modified). The function expects only THP head pages to be present in the
2918c2ecf20Sopenharmony_ci * @pvec.
2928c2ecf20Sopenharmony_ci *
2938c2ecf20Sopenharmony_ci * The function expects the i_pages lock to be held.
2948c2ecf20Sopenharmony_ci */
2958c2ecf20Sopenharmony_cistatic void page_cache_delete_batch(struct address_space *mapping,
2968c2ecf20Sopenharmony_ci			     struct pagevec *pvec)
2978c2ecf20Sopenharmony_ci{
2988c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
2998c2ecf20Sopenharmony_ci	int total_pages = 0;
3008c2ecf20Sopenharmony_ci	int i = 0;
3018c2ecf20Sopenharmony_ci	struct page *page;
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci	mapping_set_update(&xas, mapping);
3048c2ecf20Sopenharmony_ci	xas_for_each(&xas, page, ULONG_MAX) {
3058c2ecf20Sopenharmony_ci		if (i >= pagevec_count(pvec))
3068c2ecf20Sopenharmony_ci			break;
3078c2ecf20Sopenharmony_ci
3088c2ecf20Sopenharmony_ci		/* A swap/dax/shadow entry got inserted? Skip it. */
3098c2ecf20Sopenharmony_ci		if (xa_is_value(page))
3108c2ecf20Sopenharmony_ci			continue;
3118c2ecf20Sopenharmony_ci		/*
3128c2ecf20Sopenharmony_ci		 * A page got inserted in our range? Skip it. We have our
3138c2ecf20Sopenharmony_ci		 * pages locked so they are protected from being removed.
3148c2ecf20Sopenharmony_ci		 * If we see a page whose index is higher than ours, it
3158c2ecf20Sopenharmony_ci		 * means our page has been removed, which shouldn't be
3168c2ecf20Sopenharmony_ci		 * possible because we're holding the PageLock.
3178c2ecf20Sopenharmony_ci		 */
3188c2ecf20Sopenharmony_ci		if (page != pvec->pages[i]) {
3198c2ecf20Sopenharmony_ci			VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
3208c2ecf20Sopenharmony_ci					page);
3218c2ecf20Sopenharmony_ci			continue;
3228c2ecf20Sopenharmony_ci		}
3238c2ecf20Sopenharmony_ci
3248c2ecf20Sopenharmony_ci		WARN_ON_ONCE(!PageLocked(page));
3258c2ecf20Sopenharmony_ci
3268c2ecf20Sopenharmony_ci		if (page->index == xas.xa_index)
3278c2ecf20Sopenharmony_ci			page->mapping = NULL;
3288c2ecf20Sopenharmony_ci		/* Leave page->index set: truncation lookup relies on it */
3298c2ecf20Sopenharmony_ci
3308c2ecf20Sopenharmony_ci		/*
3318c2ecf20Sopenharmony_ci		 * Move to the next page in the vector if this is a regular
3328c2ecf20Sopenharmony_ci		 * page or the index is of the last sub-page of this compound
3338c2ecf20Sopenharmony_ci		 * page.
3348c2ecf20Sopenharmony_ci		 */
3358c2ecf20Sopenharmony_ci		if (page->index + compound_nr(page) - 1 == xas.xa_index)
3368c2ecf20Sopenharmony_ci			i++;
3378c2ecf20Sopenharmony_ci		xas_store(&xas, NULL);
3388c2ecf20Sopenharmony_ci		total_pages++;
3398c2ecf20Sopenharmony_ci	}
3408c2ecf20Sopenharmony_ci	mapping->nrpages -= total_pages;
3418c2ecf20Sopenharmony_ci}
3428c2ecf20Sopenharmony_ci
3438c2ecf20Sopenharmony_civoid delete_from_page_cache_batch(struct address_space *mapping,
3448c2ecf20Sopenharmony_ci				  struct pagevec *pvec)
3458c2ecf20Sopenharmony_ci{
3468c2ecf20Sopenharmony_ci	int i;
3478c2ecf20Sopenharmony_ci	unsigned long flags;
3488c2ecf20Sopenharmony_ci
3498c2ecf20Sopenharmony_ci	if (!pagevec_count(pvec))
3508c2ecf20Sopenharmony_ci		return;
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci	xa_lock_irqsave(&mapping->i_pages, flags);
3538c2ecf20Sopenharmony_ci	for (i = 0; i < pagevec_count(pvec); i++) {
3548c2ecf20Sopenharmony_ci		trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
3558c2ecf20Sopenharmony_ci
3568c2ecf20Sopenharmony_ci		unaccount_page_cache_page(mapping, pvec->pages[i]);
3578c2ecf20Sopenharmony_ci	}
3588c2ecf20Sopenharmony_ci	page_cache_delete_batch(mapping, pvec);
3598c2ecf20Sopenharmony_ci	xa_unlock_irqrestore(&mapping->i_pages, flags);
3608c2ecf20Sopenharmony_ci
3618c2ecf20Sopenharmony_ci	for (i = 0; i < pagevec_count(pvec); i++)
3628c2ecf20Sopenharmony_ci		page_cache_free_page(mapping, pvec->pages[i]);
3638c2ecf20Sopenharmony_ci}
3648c2ecf20Sopenharmony_ci
3658c2ecf20Sopenharmony_ciint filemap_check_errors(struct address_space *mapping)
3668c2ecf20Sopenharmony_ci{
3678c2ecf20Sopenharmony_ci	int ret = 0;
3688c2ecf20Sopenharmony_ci	/* Check for outstanding write errors */
3698c2ecf20Sopenharmony_ci	if (test_bit(AS_ENOSPC, &mapping->flags) &&
3708c2ecf20Sopenharmony_ci	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
3718c2ecf20Sopenharmony_ci		ret = -ENOSPC;
3728c2ecf20Sopenharmony_ci	if (test_bit(AS_EIO, &mapping->flags) &&
3738c2ecf20Sopenharmony_ci	    test_and_clear_bit(AS_EIO, &mapping->flags))
3748c2ecf20Sopenharmony_ci		ret = -EIO;
3758c2ecf20Sopenharmony_ci	return ret;
3768c2ecf20Sopenharmony_ci}
3778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_check_errors);
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_cistatic int filemap_check_and_keep_errors(struct address_space *mapping)
3808c2ecf20Sopenharmony_ci{
3818c2ecf20Sopenharmony_ci	/* Check for outstanding write errors */
3828c2ecf20Sopenharmony_ci	if (test_bit(AS_EIO, &mapping->flags))
3838c2ecf20Sopenharmony_ci		return -EIO;
3848c2ecf20Sopenharmony_ci	if (test_bit(AS_ENOSPC, &mapping->flags))
3858c2ecf20Sopenharmony_ci		return -ENOSPC;
3868c2ecf20Sopenharmony_ci	return 0;
3878c2ecf20Sopenharmony_ci}
3888c2ecf20Sopenharmony_ci
3898c2ecf20Sopenharmony_ci/**
3908c2ecf20Sopenharmony_ci * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
3918c2ecf20Sopenharmony_ci * @mapping:	address space structure to write
3928c2ecf20Sopenharmony_ci * @start:	offset in bytes where the range starts
3938c2ecf20Sopenharmony_ci * @end:	offset in bytes where the range ends (inclusive)
3948c2ecf20Sopenharmony_ci * @sync_mode:	enable synchronous operation
3958c2ecf20Sopenharmony_ci *
3968c2ecf20Sopenharmony_ci * Start writeback against all of a mapping's dirty pages that lie
3978c2ecf20Sopenharmony_ci * within the byte offsets <start, end> inclusive.
3988c2ecf20Sopenharmony_ci *
3998c2ecf20Sopenharmony_ci * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
4008c2ecf20Sopenharmony_ci * opposed to a regular memory cleansing writeback.  The difference between
4018c2ecf20Sopenharmony_ci * these two operations is that if a dirty page/buffer is encountered, it must
4028c2ecf20Sopenharmony_ci * be waited upon, and not just skipped over.
4038c2ecf20Sopenharmony_ci *
4048c2ecf20Sopenharmony_ci * Return: %0 on success, negative error code otherwise.
4058c2ecf20Sopenharmony_ci */
4068c2ecf20Sopenharmony_ciint __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
4078c2ecf20Sopenharmony_ci				loff_t end, int sync_mode)
4088c2ecf20Sopenharmony_ci{
4098c2ecf20Sopenharmony_ci	int ret;
4108c2ecf20Sopenharmony_ci	struct writeback_control wbc = {
4118c2ecf20Sopenharmony_ci		.sync_mode = sync_mode,
4128c2ecf20Sopenharmony_ci		.nr_to_write = LONG_MAX,
4138c2ecf20Sopenharmony_ci		.range_start = start,
4148c2ecf20Sopenharmony_ci		.range_end = end,
4158c2ecf20Sopenharmony_ci	};
4168c2ecf20Sopenharmony_ci
4178c2ecf20Sopenharmony_ci	if (!mapping_can_writeback(mapping) ||
4188c2ecf20Sopenharmony_ci	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
4198c2ecf20Sopenharmony_ci		return 0;
4208c2ecf20Sopenharmony_ci
4218c2ecf20Sopenharmony_ci	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
4228c2ecf20Sopenharmony_ci	ret = do_writepages(mapping, &wbc);
4238c2ecf20Sopenharmony_ci	wbc_detach_inode(&wbc);
4248c2ecf20Sopenharmony_ci	return ret;
4258c2ecf20Sopenharmony_ci}
4268c2ecf20Sopenharmony_ci
4278c2ecf20Sopenharmony_cistatic inline int __filemap_fdatawrite(struct address_space *mapping,
4288c2ecf20Sopenharmony_ci	int sync_mode)
4298c2ecf20Sopenharmony_ci{
4308c2ecf20Sopenharmony_ci	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
4318c2ecf20Sopenharmony_ci}
4328c2ecf20Sopenharmony_ci
4338c2ecf20Sopenharmony_ciint filemap_fdatawrite(struct address_space *mapping)
4348c2ecf20Sopenharmony_ci{
4358c2ecf20Sopenharmony_ci	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
4368c2ecf20Sopenharmony_ci}
4378c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_fdatawrite);
4388c2ecf20Sopenharmony_ci
4398c2ecf20Sopenharmony_ciint filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
4408c2ecf20Sopenharmony_ci				loff_t end)
4418c2ecf20Sopenharmony_ci{
4428c2ecf20Sopenharmony_ci	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
4438c2ecf20Sopenharmony_ci}
4448c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_fdatawrite_range);
4458c2ecf20Sopenharmony_ci
4468c2ecf20Sopenharmony_ci/**
4478c2ecf20Sopenharmony_ci * filemap_flush - mostly a non-blocking flush
4488c2ecf20Sopenharmony_ci * @mapping:	target address_space
4498c2ecf20Sopenharmony_ci *
4508c2ecf20Sopenharmony_ci * This is a mostly non-blocking flush.  Not suitable for data-integrity
4518c2ecf20Sopenharmony_ci * purposes - I/O may not be started against all dirty pages.
4528c2ecf20Sopenharmony_ci *
4538c2ecf20Sopenharmony_ci * Return: %0 on success, negative error code otherwise.
4548c2ecf20Sopenharmony_ci */
4558c2ecf20Sopenharmony_ciint filemap_flush(struct address_space *mapping)
4568c2ecf20Sopenharmony_ci{
4578c2ecf20Sopenharmony_ci	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
4588c2ecf20Sopenharmony_ci}
4598c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_flush);
4608c2ecf20Sopenharmony_ci
4618c2ecf20Sopenharmony_ci/**
4628c2ecf20Sopenharmony_ci * filemap_range_has_page - check if a page exists in range.
4638c2ecf20Sopenharmony_ci * @mapping:           address space within which to check
4648c2ecf20Sopenharmony_ci * @start_byte:        offset in bytes where the range starts
4658c2ecf20Sopenharmony_ci * @end_byte:          offset in bytes where the range ends (inclusive)
4668c2ecf20Sopenharmony_ci *
4678c2ecf20Sopenharmony_ci * Find at least one page in the range supplied, usually used to check if
4688c2ecf20Sopenharmony_ci * direct writing in this range will trigger a writeback.
4698c2ecf20Sopenharmony_ci *
4708c2ecf20Sopenharmony_ci * Return: %true if at least one page exists in the specified range,
4718c2ecf20Sopenharmony_ci * %false otherwise.
4728c2ecf20Sopenharmony_ci */
4738c2ecf20Sopenharmony_cibool filemap_range_has_page(struct address_space *mapping,
4748c2ecf20Sopenharmony_ci			   loff_t start_byte, loff_t end_byte)
4758c2ecf20Sopenharmony_ci{
4768c2ecf20Sopenharmony_ci	struct page *page;
4778c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
4788c2ecf20Sopenharmony_ci	pgoff_t max = end_byte >> PAGE_SHIFT;
4798c2ecf20Sopenharmony_ci
4808c2ecf20Sopenharmony_ci	if (end_byte < start_byte)
4818c2ecf20Sopenharmony_ci		return false;
4828c2ecf20Sopenharmony_ci
4838c2ecf20Sopenharmony_ci	rcu_read_lock();
4848c2ecf20Sopenharmony_ci	for (;;) {
4858c2ecf20Sopenharmony_ci		page = xas_find(&xas, max);
4868c2ecf20Sopenharmony_ci		if (xas_retry(&xas, page))
4878c2ecf20Sopenharmony_ci			continue;
4888c2ecf20Sopenharmony_ci		/* Shadow entries don't count */
4898c2ecf20Sopenharmony_ci		if (xa_is_value(page))
4908c2ecf20Sopenharmony_ci			continue;
4918c2ecf20Sopenharmony_ci		/*
4928c2ecf20Sopenharmony_ci		 * We don't need to try to pin this page; we're about to
4938c2ecf20Sopenharmony_ci		 * release the RCU lock anyway.  It is enough to know that
4948c2ecf20Sopenharmony_ci		 * there was a page here recently.
4958c2ecf20Sopenharmony_ci		 */
4968c2ecf20Sopenharmony_ci		break;
4978c2ecf20Sopenharmony_ci	}
4988c2ecf20Sopenharmony_ci	rcu_read_unlock();
4998c2ecf20Sopenharmony_ci
5008c2ecf20Sopenharmony_ci	return page != NULL;
5018c2ecf20Sopenharmony_ci}
5028c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_range_has_page);
5038c2ecf20Sopenharmony_ci
5048c2ecf20Sopenharmony_cistatic void __filemap_fdatawait_range(struct address_space *mapping,
5058c2ecf20Sopenharmony_ci				     loff_t start_byte, loff_t end_byte)
5068c2ecf20Sopenharmony_ci{
5078c2ecf20Sopenharmony_ci	pgoff_t index = start_byte >> PAGE_SHIFT;
5088c2ecf20Sopenharmony_ci	pgoff_t end = end_byte >> PAGE_SHIFT;
5098c2ecf20Sopenharmony_ci	struct pagevec pvec;
5108c2ecf20Sopenharmony_ci	int nr_pages;
5118c2ecf20Sopenharmony_ci
5128c2ecf20Sopenharmony_ci	if (end_byte < start_byte)
5138c2ecf20Sopenharmony_ci		return;
5148c2ecf20Sopenharmony_ci
5158c2ecf20Sopenharmony_ci	pagevec_init(&pvec);
5168c2ecf20Sopenharmony_ci	while (index <= end) {
5178c2ecf20Sopenharmony_ci		unsigned i;
5188c2ecf20Sopenharmony_ci
5198c2ecf20Sopenharmony_ci		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
5208c2ecf20Sopenharmony_ci				end, PAGECACHE_TAG_WRITEBACK);
5218c2ecf20Sopenharmony_ci		if (!nr_pages)
5228c2ecf20Sopenharmony_ci			break;
5238c2ecf20Sopenharmony_ci
5248c2ecf20Sopenharmony_ci		for (i = 0; i < nr_pages; i++) {
5258c2ecf20Sopenharmony_ci			struct page *page = pvec.pages[i];
5268c2ecf20Sopenharmony_ci
5278c2ecf20Sopenharmony_ci			wait_on_page_writeback(page);
5288c2ecf20Sopenharmony_ci			ClearPageError(page);
5298c2ecf20Sopenharmony_ci		}
5308c2ecf20Sopenharmony_ci		pagevec_release(&pvec);
5318c2ecf20Sopenharmony_ci		cond_resched();
5328c2ecf20Sopenharmony_ci	}
5338c2ecf20Sopenharmony_ci}
5348c2ecf20Sopenharmony_ci
5358c2ecf20Sopenharmony_ci/**
5368c2ecf20Sopenharmony_ci * filemap_fdatawait_range - wait for writeback to complete
5378c2ecf20Sopenharmony_ci * @mapping:		address space structure to wait for
5388c2ecf20Sopenharmony_ci * @start_byte:		offset in bytes where the range starts
5398c2ecf20Sopenharmony_ci * @end_byte:		offset in bytes where the range ends (inclusive)
5408c2ecf20Sopenharmony_ci *
5418c2ecf20Sopenharmony_ci * Walk the list of under-writeback pages of the given address space
5428c2ecf20Sopenharmony_ci * in the given range and wait for all of them.  Check error status of
5438c2ecf20Sopenharmony_ci * the address space and return it.
5448c2ecf20Sopenharmony_ci *
5458c2ecf20Sopenharmony_ci * Since the error status of the address space is cleared by this function,
5468c2ecf20Sopenharmony_ci * callers are responsible for checking the return value and handling and/or
5478c2ecf20Sopenharmony_ci * reporting the error.
5488c2ecf20Sopenharmony_ci *
5498c2ecf20Sopenharmony_ci * Return: error status of the address space.
5508c2ecf20Sopenharmony_ci */
5518c2ecf20Sopenharmony_ciint filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
5528c2ecf20Sopenharmony_ci			    loff_t end_byte)
5538c2ecf20Sopenharmony_ci{
5548c2ecf20Sopenharmony_ci	__filemap_fdatawait_range(mapping, start_byte, end_byte);
5558c2ecf20Sopenharmony_ci	return filemap_check_errors(mapping);
5568c2ecf20Sopenharmony_ci}
5578c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_fdatawait_range);
5588c2ecf20Sopenharmony_ci
5598c2ecf20Sopenharmony_ci/**
5608c2ecf20Sopenharmony_ci * filemap_fdatawait_range_keep_errors - wait for writeback to complete
5618c2ecf20Sopenharmony_ci * @mapping:		address space structure to wait for
5628c2ecf20Sopenharmony_ci * @start_byte:		offset in bytes where the range starts
5638c2ecf20Sopenharmony_ci * @end_byte:		offset in bytes where the range ends (inclusive)
5648c2ecf20Sopenharmony_ci *
5658c2ecf20Sopenharmony_ci * Walk the list of under-writeback pages of the given address space in the
5668c2ecf20Sopenharmony_ci * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
5678c2ecf20Sopenharmony_ci * this function does not clear error status of the address space.
5688c2ecf20Sopenharmony_ci *
5698c2ecf20Sopenharmony_ci * Use this function if callers don't handle errors themselves.  Expected
5708c2ecf20Sopenharmony_ci * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
5718c2ecf20Sopenharmony_ci * fsfreeze(8)
5728c2ecf20Sopenharmony_ci */
5738c2ecf20Sopenharmony_ciint filemap_fdatawait_range_keep_errors(struct address_space *mapping,
5748c2ecf20Sopenharmony_ci		loff_t start_byte, loff_t end_byte)
5758c2ecf20Sopenharmony_ci{
5768c2ecf20Sopenharmony_ci	__filemap_fdatawait_range(mapping, start_byte, end_byte);
5778c2ecf20Sopenharmony_ci	return filemap_check_and_keep_errors(mapping);
5788c2ecf20Sopenharmony_ci}
5798c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
5808c2ecf20Sopenharmony_ci
5818c2ecf20Sopenharmony_ci/**
5828c2ecf20Sopenharmony_ci * file_fdatawait_range - wait for writeback to complete
5838c2ecf20Sopenharmony_ci * @file:		file pointing to address space structure to wait for
5848c2ecf20Sopenharmony_ci * @start_byte:		offset in bytes where the range starts
5858c2ecf20Sopenharmony_ci * @end_byte:		offset in bytes where the range ends (inclusive)
5868c2ecf20Sopenharmony_ci *
5878c2ecf20Sopenharmony_ci * Walk the list of under-writeback pages of the address space that file
5888c2ecf20Sopenharmony_ci * refers to, in the given range and wait for all of them.  Check error
5898c2ecf20Sopenharmony_ci * status of the address space vs. the file->f_wb_err cursor and return it.
5908c2ecf20Sopenharmony_ci *
5918c2ecf20Sopenharmony_ci * Since the error status of the file is advanced by this function,
5928c2ecf20Sopenharmony_ci * callers are responsible for checking the return value and handling and/or
5938c2ecf20Sopenharmony_ci * reporting the error.
5948c2ecf20Sopenharmony_ci *
5958c2ecf20Sopenharmony_ci * Return: error status of the address space vs. the file->f_wb_err cursor.
5968c2ecf20Sopenharmony_ci */
5978c2ecf20Sopenharmony_ciint file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
5988c2ecf20Sopenharmony_ci{
5998c2ecf20Sopenharmony_ci	struct address_space *mapping = file->f_mapping;
6008c2ecf20Sopenharmony_ci
6018c2ecf20Sopenharmony_ci	__filemap_fdatawait_range(mapping, start_byte, end_byte);
6028c2ecf20Sopenharmony_ci	return file_check_and_advance_wb_err(file);
6038c2ecf20Sopenharmony_ci}
6048c2ecf20Sopenharmony_ciEXPORT_SYMBOL(file_fdatawait_range);
6058c2ecf20Sopenharmony_ci
6068c2ecf20Sopenharmony_ci/**
6078c2ecf20Sopenharmony_ci * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
6088c2ecf20Sopenharmony_ci * @mapping: address space structure to wait for
6098c2ecf20Sopenharmony_ci *
6108c2ecf20Sopenharmony_ci * Walk the list of under-writeback pages of the given address space
6118c2ecf20Sopenharmony_ci * and wait for all of them.  Unlike filemap_fdatawait(), this function
6128c2ecf20Sopenharmony_ci * does not clear error status of the address space.
6138c2ecf20Sopenharmony_ci *
6148c2ecf20Sopenharmony_ci * Use this function if callers don't handle errors themselves.  Expected
6158c2ecf20Sopenharmony_ci * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
6168c2ecf20Sopenharmony_ci * fsfreeze(8)
6178c2ecf20Sopenharmony_ci *
6188c2ecf20Sopenharmony_ci * Return: error status of the address space.
6198c2ecf20Sopenharmony_ci */
6208c2ecf20Sopenharmony_ciint filemap_fdatawait_keep_errors(struct address_space *mapping)
6218c2ecf20Sopenharmony_ci{
6228c2ecf20Sopenharmony_ci	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
6238c2ecf20Sopenharmony_ci	return filemap_check_and_keep_errors(mapping);
6248c2ecf20Sopenharmony_ci}
6258c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_fdatawait_keep_errors);
6268c2ecf20Sopenharmony_ci
6278c2ecf20Sopenharmony_ci/* Returns true if writeback might be needed or already in progress. */
6288c2ecf20Sopenharmony_cistatic bool mapping_needs_writeback(struct address_space *mapping)
6298c2ecf20Sopenharmony_ci{
6308c2ecf20Sopenharmony_ci	if (dax_mapping(mapping))
6318c2ecf20Sopenharmony_ci		return mapping->nrexceptional;
6328c2ecf20Sopenharmony_ci
6338c2ecf20Sopenharmony_ci	return mapping->nrpages;
6348c2ecf20Sopenharmony_ci}
6358c2ecf20Sopenharmony_ci
6368c2ecf20Sopenharmony_ci/**
6378c2ecf20Sopenharmony_ci * filemap_write_and_wait_range - write out & wait on a file range
6388c2ecf20Sopenharmony_ci * @mapping:	the address_space for the pages
6398c2ecf20Sopenharmony_ci * @lstart:	offset in bytes where the range starts
6408c2ecf20Sopenharmony_ci * @lend:	offset in bytes where the range ends (inclusive)
6418c2ecf20Sopenharmony_ci *
6428c2ecf20Sopenharmony_ci * Write out and wait upon file offsets lstart->lend, inclusive.
6438c2ecf20Sopenharmony_ci *
6448c2ecf20Sopenharmony_ci * Note that @lend is inclusive (describes the last byte to be written) so
6458c2ecf20Sopenharmony_ci * that this function can be used to write to the very end-of-file (end = -1).
6468c2ecf20Sopenharmony_ci *
6478c2ecf20Sopenharmony_ci * Return: error status of the address space.
6488c2ecf20Sopenharmony_ci */
6498c2ecf20Sopenharmony_ciint filemap_write_and_wait_range(struct address_space *mapping,
6508c2ecf20Sopenharmony_ci				 loff_t lstart, loff_t lend)
6518c2ecf20Sopenharmony_ci{
6528c2ecf20Sopenharmony_ci	int err = 0;
6538c2ecf20Sopenharmony_ci
6548c2ecf20Sopenharmony_ci	if (mapping_needs_writeback(mapping)) {
6558c2ecf20Sopenharmony_ci		err = __filemap_fdatawrite_range(mapping, lstart, lend,
6568c2ecf20Sopenharmony_ci						 WB_SYNC_ALL);
6578c2ecf20Sopenharmony_ci		/*
6588c2ecf20Sopenharmony_ci		 * Even if the above returned error, the pages may be
6598c2ecf20Sopenharmony_ci		 * written partially (e.g. -ENOSPC), so we wait for it.
6608c2ecf20Sopenharmony_ci		 * But the -EIO is special case, it may indicate the worst
6618c2ecf20Sopenharmony_ci		 * thing (e.g. bug) happened, so we avoid waiting for it.
6628c2ecf20Sopenharmony_ci		 */
6638c2ecf20Sopenharmony_ci		if (err != -EIO) {
6648c2ecf20Sopenharmony_ci			int err2 = filemap_fdatawait_range(mapping,
6658c2ecf20Sopenharmony_ci						lstart, lend);
6668c2ecf20Sopenharmony_ci			if (!err)
6678c2ecf20Sopenharmony_ci				err = err2;
6688c2ecf20Sopenharmony_ci		} else {
6698c2ecf20Sopenharmony_ci			/* Clear any previously stored errors */
6708c2ecf20Sopenharmony_ci			filemap_check_errors(mapping);
6718c2ecf20Sopenharmony_ci		}
6728c2ecf20Sopenharmony_ci	} else {
6738c2ecf20Sopenharmony_ci		err = filemap_check_errors(mapping);
6748c2ecf20Sopenharmony_ci	}
6758c2ecf20Sopenharmony_ci	return err;
6768c2ecf20Sopenharmony_ci}
6778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_write_and_wait_range);
6788c2ecf20Sopenharmony_ci
/*
 * __filemap_set_wb_err - record a writeback error on an address_space
 * @mapping: mapping in which to record the error
 * @err: negative error code to record
 *
 * Folds @err into mapping->wb_err via errseq_set() and emits a tracepoint
 * with the resulting sequence value.  NOTE(review): errseq_set() is kept in
 * its own statement rather than inlined into the trace call, presumably
 * because tracepoint arguments are only evaluated while the tracepoint is
 * enabled — confirm before restructuring.
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);
6868c2ecf20Sopenharmony_ci
/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	/* Lockless snapshot of this file's private error cursor. */
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		/* Re-read the cursor under f_lock; it may have moved. */
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);
7388c2ecf20Sopenharmony_ci
7398c2ecf20Sopenharmony_ci/**
7408c2ecf20Sopenharmony_ci * file_write_and_wait_range - write out & wait on a file range
7418c2ecf20Sopenharmony_ci * @file:	file pointing to address_space with pages
7428c2ecf20Sopenharmony_ci * @lstart:	offset in bytes where the range starts
7438c2ecf20Sopenharmony_ci * @lend:	offset in bytes where the range ends (inclusive)
7448c2ecf20Sopenharmony_ci *
7458c2ecf20Sopenharmony_ci * Write out and wait upon file offsets lstart->lend, inclusive.
7468c2ecf20Sopenharmony_ci *
7478c2ecf20Sopenharmony_ci * Note that @lend is inclusive (describes the last byte to be written) so
7488c2ecf20Sopenharmony_ci * that this function can be used to write to the very end-of-file (end = -1).
7498c2ecf20Sopenharmony_ci *
7508c2ecf20Sopenharmony_ci * After writing out and waiting on the data, we check and advance the
7518c2ecf20Sopenharmony_ci * f_wb_err cursor to the latest value, and return any errors detected there.
7528c2ecf20Sopenharmony_ci *
7538c2ecf20Sopenharmony_ci * Return: %0 on success, negative error code otherwise.
7548c2ecf20Sopenharmony_ci */
7558c2ecf20Sopenharmony_ciint file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
7568c2ecf20Sopenharmony_ci{
7578c2ecf20Sopenharmony_ci	int err = 0, err2;
7588c2ecf20Sopenharmony_ci	struct address_space *mapping = file->f_mapping;
7598c2ecf20Sopenharmony_ci
7608c2ecf20Sopenharmony_ci	if (mapping_needs_writeback(mapping)) {
7618c2ecf20Sopenharmony_ci		err = __filemap_fdatawrite_range(mapping, lstart, lend,
7628c2ecf20Sopenharmony_ci						 WB_SYNC_ALL);
7638c2ecf20Sopenharmony_ci		/* See comment of filemap_write_and_wait() */
7648c2ecf20Sopenharmony_ci		if (err != -EIO)
7658c2ecf20Sopenharmony_ci			__filemap_fdatawait_range(mapping, lstart, lend);
7668c2ecf20Sopenharmony_ci	}
7678c2ecf20Sopenharmony_ci	err2 = file_check_and_advance_wb_err(file);
7688c2ecf20Sopenharmony_ci	if (!err)
7698c2ecf20Sopenharmony_ci		err = err2;
7708c2ecf20Sopenharmony_ci	return err;
7718c2ecf20Sopenharmony_ci}
7728c2ecf20Sopenharmony_ciEXPORT_SYMBOL(file_write_and_wait_range);
7738c2ecf20Sopenharmony_ci
/**
 * replace_page_cache_page - replace a pagecache page with a new one
 * @old:	page to be replaced
 * @new:	page to replace with
 * @gfp_mask:	allocation mode
 *
 * This function replaces a page in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new page and
 * drops it for the old page.  Both the old and new pages must be
 * locked.  This function does not add the new page to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 *
 * Return: %0
 */
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
	struct address_space *mapping = old->mapping;
	void (*freepage)(struct page *) = mapping->a_ops->freepage;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);
	unsigned long flags;

	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);
	VM_BUG_ON_PAGE(new->mapping, new);

	/* Take the page cache's reference on @new before publishing it. */
	get_page(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_migrate(old, new);

	xas_lock_irqsave(&xas, flags);
	/* Swap @new into @old's slot; the replacement is atomic under xa_lock. */
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!PageHuge(old))
		__dec_lruvec_page_state(old, NR_FILE_PAGES);
	if (!PageHuge(new))
		__inc_lruvec_page_state(new, NR_FILE_PAGES);
	if (PageSwapBacked(old))
		__dec_lruvec_page_state(old, NR_SHMEM);
	if (PageSwapBacked(new))
		__inc_lruvec_page_state(new, NR_SHMEM);
	xas_unlock_irqrestore(&xas, flags);
	/* Let the filesystem release any private state attached to @old. */
	if (freepage)
		freepage(old);
	/* Drop the page cache's reference on @old. */
	put_page(old);

	return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
8298c2ecf20Sopenharmony_ci
/*
 * __add_to_page_cache_locked - insert a locked page into the page cache
 * @page:	page to insert (must be locked)
 * @mapping:	address space to insert into
 * @offset:	page index at which to insert
 * @gfp:	memory allocation flags
 * @shadowp:	if non-NULL, receives any shadow (exceptional) entry that
 *		was replaced at @offset
 *
 * Charges @page to the current memcg (hugetlb pages are exempt), then
 * stores it in the mapping's xarray, splitting a larger existing entry
 * when necessary.  Fails with -EEXIST if a real page already occupies
 * the slot.  On failure the charge is reverted, page->mapping is reset
 * and the page-cache reference is dropped; page->index is deliberately
 * left set (truncation relies upon it).
 */
noinline int __add_to_page_cache_locked(struct page *page,
					struct address_space *mapping,
					pgoff_t offset, gfp_t gfp,
					void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, offset);
	int huge = PageHuge(page);
	int error;
	bool charged = false;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
	mapping_set_update(&xas, mapping);

	/* The page cache takes its own reference on the page. */
	get_page(page);
	page->mapping = mapping;
	page->index = offset;

	if (!huge) {
		error = mem_cgroup_charge(page, current->mm, gfp);
		if (error)
			goto error;
		charged = true;
	}

	gfp &= GFP_RECLAIM_MASK;

	/* Retry loop: xas_nomem() allocates outside the lock on ENOMEM. */
	do {
		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
		void *entry, *old = NULL;

		if (order > thp_order(page))
			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
					order, gfp);
		xas_lock_irq(&xas);
		/* Only shadow (value) entries may be replaced; real pages conflict. */
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
		}

		if (old) {
			if (shadowp)
				*shadowp = old;
			/* entry may have been split before we acquired lock */
			order = xa_get_order(xas.xa, xas.xa_index);
			if (order > thp_order(page)) {
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
		}

		xas_store(&xas, page);
		if (xas_error(&xas))
			goto unlock;

		if (old)
			mapping->nrexceptional--;
		mapping->nrpages++;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge)
			__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		/* Undo the memcg charge taken above before bailing out. */
		if (charged)
			mem_cgroup_uncharge(page);
		goto error;
	}

	trace_mm_filemap_add_to_page_cache(page);
	return 0;
error:
	page->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	put_page(page);
	return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
9158c2ecf20Sopenharmony_ci
9168c2ecf20Sopenharmony_ci/**
9178c2ecf20Sopenharmony_ci * add_to_page_cache_locked - add a locked page to the pagecache
9188c2ecf20Sopenharmony_ci * @page:	page to add
9198c2ecf20Sopenharmony_ci * @mapping:	the page's address_space
9208c2ecf20Sopenharmony_ci * @offset:	page index
9218c2ecf20Sopenharmony_ci * @gfp_mask:	page allocation mode
9228c2ecf20Sopenharmony_ci *
9238c2ecf20Sopenharmony_ci * This function is used to add a page to the pagecache. It must be locked.
9248c2ecf20Sopenharmony_ci * This function does not add the page to the LRU.  The caller must do that.
9258c2ecf20Sopenharmony_ci *
9268c2ecf20Sopenharmony_ci * Return: %0 on success, negative error code otherwise.
9278c2ecf20Sopenharmony_ci */
9288c2ecf20Sopenharmony_ciint add_to_page_cache_locked(struct page *page, struct address_space *mapping,
9298c2ecf20Sopenharmony_ci		pgoff_t offset, gfp_t gfp_mask)
9308c2ecf20Sopenharmony_ci{
9318c2ecf20Sopenharmony_ci	return __add_to_page_cache_locked(page, mapping, offset,
9328c2ecf20Sopenharmony_ci					  gfp_mask, NULL);
9338c2ecf20Sopenharmony_ci}
9348c2ecf20Sopenharmony_ciEXPORT_SYMBOL(add_to_page_cache_locked);
9358c2ecf20Sopenharmony_ci
9368c2ecf20Sopenharmony_ciint add_to_page_cache_lru(struct page *page, struct address_space *mapping,
9378c2ecf20Sopenharmony_ci				pgoff_t offset, gfp_t gfp_mask)
9388c2ecf20Sopenharmony_ci{
9398c2ecf20Sopenharmony_ci	void *shadow = NULL;
9408c2ecf20Sopenharmony_ci	int ret;
9418c2ecf20Sopenharmony_ci
9428c2ecf20Sopenharmony_ci	__SetPageLocked(page);
9438c2ecf20Sopenharmony_ci	ret = __add_to_page_cache_locked(page, mapping, offset,
9448c2ecf20Sopenharmony_ci					 gfp_mask, &shadow);
9458c2ecf20Sopenharmony_ci	if (unlikely(ret))
9468c2ecf20Sopenharmony_ci		__ClearPageLocked(page);
9478c2ecf20Sopenharmony_ci	else {
9488c2ecf20Sopenharmony_ci		/*
9498c2ecf20Sopenharmony_ci		 * The page might have been evicted from cache only
9508c2ecf20Sopenharmony_ci		 * recently, in which case it should be activated like
9518c2ecf20Sopenharmony_ci		 * any other repeatedly accessed page.
9528c2ecf20Sopenharmony_ci		 * The exception is pages getting rewritten; evicting other
9538c2ecf20Sopenharmony_ci		 * data from the working set, only to cache data that will
9548c2ecf20Sopenharmony_ci		 * get overwritten with something else, is a waste of memory.
9558c2ecf20Sopenharmony_ci		 */
9568c2ecf20Sopenharmony_ci		WARN_ON_ONCE(PageActive(page));
9578c2ecf20Sopenharmony_ci		if (!(gfp_mask & __GFP_WRITE) && shadow)
9588c2ecf20Sopenharmony_ci			workingset_refault(page, shadow);
9598c2ecf20Sopenharmony_ci		lru_cache_add(page);
9608c2ecf20Sopenharmony_ci	}
9618c2ecf20Sopenharmony_ci	return ret;
9628c2ecf20Sopenharmony_ci}
9638c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(add_to_page_cache_lru);
9648c2ecf20Sopenharmony_ci
#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	unsigned int cookie;
	struct page *page;
	int nid;

	/* Without cpuset memory spreading, allocate per the default policy. */
	if (!cpuset_do_page_mem_spread())
		return alloc_pages(gfp, 0);

	/* Spread page cache allocations across the cpuset's allowed nodes. */
	do {
		cookie = read_mems_allowed_begin();
		nid = cpuset_mem_spread_node();
		page = __alloc_pages_node(nid, gfp, 0);
	} while (!page && read_mems_allowed_retry(cookie));

	return page;
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif
9858c2ecf20Sopenharmony_ci
9868c2ecf20Sopenharmony_ci/*
9878c2ecf20Sopenharmony_ci * In order to wait for pages to become available there must be
9888c2ecf20Sopenharmony_ci * waitqueues associated with pages. By using a hash table of
9898c2ecf20Sopenharmony_ci * waitqueues where the bucket discipline is to maintain all
9908c2ecf20Sopenharmony_ci * waiters on the same queue and wake all when any of the pages
9918c2ecf20Sopenharmony_ci * become available, and for the woken contexts to check to be
9928c2ecf20Sopenharmony_ci * sure the appropriate page became available, this saves space
9938c2ecf20Sopenharmony_ci * at a cost of "thundering herd" phenomena during rare hash
9948c2ecf20Sopenharmony_ci * collisions.
9958c2ecf20Sopenharmony_ci */
9968c2ecf20Sopenharmony_ci#define PAGE_WAIT_TABLE_BITS 8
9978c2ecf20Sopenharmony_ci#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
9988c2ecf20Sopenharmony_cistatic wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
9998c2ecf20Sopenharmony_ci
10008c2ecf20Sopenharmony_cistatic wait_queue_head_t *page_waitqueue(struct page *page)
10018c2ecf20Sopenharmony_ci{
10028c2ecf20Sopenharmony_ci	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
10038c2ecf20Sopenharmony_ci}
10048c2ecf20Sopenharmony_ci
10058c2ecf20Sopenharmony_civoid __init pagecache_init(void)
10068c2ecf20Sopenharmony_ci{
10078c2ecf20Sopenharmony_ci	int i;
10088c2ecf20Sopenharmony_ci
10098c2ecf20Sopenharmony_ci	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
10108c2ecf20Sopenharmony_ci		init_waitqueue_head(&page_wait_table[i]);
10118c2ecf20Sopenharmony_ci
10128c2ecf20Sopenharmony_ci	page_writeback_init();
10138c2ecf20Sopenharmony_ci}
10148c2ecf20Sopenharmony_ci
10158c2ecf20Sopenharmony_ci/*
10168c2ecf20Sopenharmony_ci * The page wait code treats the "wait->flags" somewhat unusually, because
10178c2ecf20Sopenharmony_ci * we have multiple different kinds of waits, not just the usual "exclusive"
10188c2ecf20Sopenharmony_ci * one.
10198c2ecf20Sopenharmony_ci *
10208c2ecf20Sopenharmony_ci * We have:
10218c2ecf20Sopenharmony_ci *
10228c2ecf20Sopenharmony_ci *  (a) no special bits set:
10238c2ecf20Sopenharmony_ci *
10248c2ecf20Sopenharmony_ci *	We're just waiting for the bit to be released, and when a waker
10258c2ecf20Sopenharmony_ci *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
10268c2ecf20Sopenharmony_ci *	and remove it from the wait queue.
10278c2ecf20Sopenharmony_ci *
10288c2ecf20Sopenharmony_ci *	Simple and straightforward.
10298c2ecf20Sopenharmony_ci *
10308c2ecf20Sopenharmony_ci *  (b) WQ_FLAG_EXCLUSIVE:
10318c2ecf20Sopenharmony_ci *
10328c2ecf20Sopenharmony_ci *	The waiter is waiting to get the lock, and only one waiter should
10338c2ecf20Sopenharmony_ci *	be woken up to avoid any thundering herd behavior. We'll set the
10348c2ecf20Sopenharmony_ci *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
10358c2ecf20Sopenharmony_ci *
10368c2ecf20Sopenharmony_ci *	This is the traditional exclusive wait.
10378c2ecf20Sopenharmony_ci *
10388c2ecf20Sopenharmony_ci *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
10398c2ecf20Sopenharmony_ci *
10408c2ecf20Sopenharmony_ci *	The waiter is waiting to get the bit, and additionally wants the
10418c2ecf20Sopenharmony_ci *	lock to be transferred to it for fair lock behavior. If the lock
10428c2ecf20Sopenharmony_ci *	cannot be taken, we stop walking the wait queue without waking
10438c2ecf20Sopenharmony_ci *	the waiter.
10448c2ecf20Sopenharmony_ci *
10458c2ecf20Sopenharmony_ci *	This is the "fair lock handoff" case, and in addition to setting
10468c2ecf20Sopenharmony_ci *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
10478c2ecf20Sopenharmony_ci *	that it now has the lock.
10488c2ecf20Sopenharmony_ci */
/*
 * Wake function installed on page wait-queue entries; see the comment block
 * above for how wait->flags encodes the different wait kinds.  Returns
 * non-zero when an exclusive waiter was woken, and -1 to stop walking the
 * queue without waking the waiter (lock handoff failed).
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	/* Ignore wakeups for other pages/bits hashed onto the same queue. */
	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->page->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->page->flags))
				return -1;
			/* Bit transferred: tell the waiter it already owns it. */
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in wait_on_page_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
10998c2ecf20Sopenharmony_ci
/*
 * Wake all waiters for @page/@bit_nr hashed onto this page's wait queue.
 * A bookmark entry lets us drop and retake the queue lock periodically so
 * long wakeup walks don't hold the lock (and disable interrupts) forever.
 */
static void wake_up_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	struct wait_page_key key;
	unsigned long flags;
	wait_queue_entry_t bookmark;

	key.page = page;
	key.bit_nr = bit_nr;
	/* Set by wake_page_match() if any waiter for this page was seen. */
	key.page_match = 0;

	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock,
		 * allow pages that finish wake up asynchronously
		 * to acquire the lock and remove themselves
		 * from wait queue
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	/*
	 * It is possible for other pages to have collided on the waitqueue
	 * hash, so in that case check for a page match. That prevents a long-
	 * term waiter
	 *
	 * It is still possible to miss a case here, when we woke page waiters
	 * and removed them from the waitqueue, but there are still other
	 * page waiters.
	 */
	if (!waitqueue_active(q) || !key.page_match) {
		ClearPageWaiters(page);
		/*
		 * It's possible to miss clearing Waiters here, when we woke
		 * our page waiters, but the hashed waitqueue has waiters for
		 * other pages on it.
		 *
		 * That's okay, it's a rare case. The next waker will clear it.
		 */
	}
	spin_unlock_irqrestore(&q->lock, flags);
}
11538c2ecf20Sopenharmony_ci
/*
 * Wake waiters for @bit on @page, but only when the PG_waiters flag
 * says somebody may actually be queued -- avoids the waitqueue hash
 * lookup in the common uncontended case.
 */
static void wake_up_page(struct page *page, int bit)
{
	if (PageWaiters(page))
		wake_up_page_bit(page, bit);
}
11608c2ecf20Sopenharmony_ci
/*
 * A choice of three behaviors for wait_on_page_bit_common():
 *
 * The behavior selects both how the page reference is handled across
 * the wait and what the waiter does with the bit once woken.
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __lock_page() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * wait_on_page_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like put_and_wait_on_page_locked() on PG_locked.
			 */
};
11758c2ecf20Sopenharmony_ci
/*
 * Attempt to check (or get) the page bit, and mark us done
 * if successful.
 *
 * An exclusive waiter actually acquires the bit with test_and_set_bit();
 * a shared waiter merely tests it.  On success, both WQ_FLAG_WOKEN and
 * WQ_FLAG_DONE are set in wait->flags so the sleeping side knows no
 * further bit testing is needed.
 */
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &page->flags))
			return false;	/* bit already held; must sleep */
	} else if (test_bit(bit_nr, &page->flags))
		return false;		/* bit still set; must sleep */

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}
11928c2ecf20Sopenharmony_ci
/* How many times do we accept lock stealing from under a waiter? */
/*
 * Once an EXCLUSIVE waiter has lost the lock race this many times, it
 * sets WQ_FLAG_CUSTOM on its wait entry (see wait_on_page_bit_common()),
 * presumably so the waker hands the lock over instead of letting it be
 * stolen again -- the wake function lives elsewhere; verify there.
 */
int sysctl_page_lock_unfairness = 5;
11958c2ecf20Sopenharmony_ci
/*
 * Wait for @bit_nr on @page to clear, in task state @state.
 *
 * @behavior selects reference and bit handling (see enum behavior):
 * EXCLUSIVE waiters leave with the bit set (they "own" it), SHARED
 * waiters merely observed it clear, DROP waiters give up their page
 * reference before sleeping.
 *
 * Returns 0 when successfully woken, or -EINTR if interrupted by a
 * signal first (only possible for interruptible/killable @state).
 */
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
	struct page *page, int bit_nr, int state, enum behavior behavior)
{
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
	unsigned long pflags;

	/*
	 * Waiting for PG_locked on a not-uptodate workingset page means
	 * we are stalled on refault I/O: account it as a memory stall.
	 */
	if (bit_nr == PG_locked &&
	    !PageUptodate(page) && PageWorkingset(page)) {
		if (!PageSwapBacked(page)) {
			delayacct_thrashing_start();
			delayacct = true;
		}
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.page = page;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		/* Too many lock steals: ask for the lock to be handed over */
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the SetPageWaiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	SetPageWaiters(page);
	if (!trylock_page_bit_common(page, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the page.
	 */
	if (behavior == DROP)
		put_page(page);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the PageWaiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		if (delayacct)
			delayacct_thrashing_end();
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
13348c2ecf20Sopenharmony_ci
13358c2ecf20Sopenharmony_civoid wait_on_page_bit(struct page *page, int bit_nr)
13368c2ecf20Sopenharmony_ci{
13378c2ecf20Sopenharmony_ci	wait_queue_head_t *q = page_waitqueue(page);
13388c2ecf20Sopenharmony_ci	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
13398c2ecf20Sopenharmony_ci}
13408c2ecf20Sopenharmony_ciEXPORT_SYMBOL(wait_on_page_bit);
13418c2ecf20Sopenharmony_ci
13428c2ecf20Sopenharmony_ciint wait_on_page_bit_killable(struct page *page, int bit_nr)
13438c2ecf20Sopenharmony_ci{
13448c2ecf20Sopenharmony_ci	wait_queue_head_t *q = page_waitqueue(page);
13458c2ecf20Sopenharmony_ci	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
13468c2ecf20Sopenharmony_ci}
13478c2ecf20Sopenharmony_ciEXPORT_SYMBOL(wait_on_page_bit_killable);
13488c2ecf20Sopenharmony_ci
/*
 * Async wait for PG_locked: queue @wait on the page's waitqueue, then -
 * still under the waitqueue lock - either try to take the page lock
 * (@set == true) or just test it (@set == false).
 *
 * Returns 0 if the condition was already satisfied (the entry is removed
 * again before any callback can fire), or -EIOCBQUEUED if the waiter was
 * left queued for an asynchronous wakeup via wait->wait.func.
 */
static int __wait_on_page_locked_async(struct page *page,
				       struct wait_page_queue *wait, bool set)
{
	struct wait_queue_head *q = page_waitqueue(page);
	int ret = 0;

	wait->page = page;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	SetPageWaiters(page);
	if (set)
		ret = !trylock_page(page);	/* 0 == we now hold the lock */
	else
		ret = PageLocked(page);		/* 0 == already unlocked */
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}
13788c2ecf20Sopenharmony_ci
13798c2ecf20Sopenharmony_cistatic int wait_on_page_locked_async(struct page *page,
13808c2ecf20Sopenharmony_ci				     struct wait_page_queue *wait)
13818c2ecf20Sopenharmony_ci{
13828c2ecf20Sopenharmony_ci	if (!PageLocked(page))
13838c2ecf20Sopenharmony_ci		return 0;
13848c2ecf20Sopenharmony_ci	return __wait_on_page_locked_async(compound_head(page), wait, false);
13858c2ecf20Sopenharmony_ci}
13868c2ecf20Sopenharmony_ci
13878c2ecf20Sopenharmony_ci/**
13888c2ecf20Sopenharmony_ci * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
13898c2ecf20Sopenharmony_ci * @page: The page to wait for.
13908c2ecf20Sopenharmony_ci *
13918c2ecf20Sopenharmony_ci * The caller should hold a reference on @page.  They expect the page to
13928c2ecf20Sopenharmony_ci * become unlocked relatively soon, but do not wish to hold up migration
13938c2ecf20Sopenharmony_ci * (for example) by holding the reference while waiting for the page to
13948c2ecf20Sopenharmony_ci * come unlocked.  After this function returns, the caller should not
13958c2ecf20Sopenharmony_ci * dereference @page.
13968c2ecf20Sopenharmony_ci */
13978c2ecf20Sopenharmony_civoid put_and_wait_on_page_locked(struct page *page)
13988c2ecf20Sopenharmony_ci{
13998c2ecf20Sopenharmony_ci	wait_queue_head_t *q;
14008c2ecf20Sopenharmony_ci
14018c2ecf20Sopenharmony_ci	page = compound_head(page);
14028c2ecf20Sopenharmony_ci	q = page_waitqueue(page);
14038c2ecf20Sopenharmony_ci	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
14048c2ecf20Sopenharmony_ci}
14058c2ecf20Sopenharmony_ci
14068c2ecf20Sopenharmony_ci/**
14078c2ecf20Sopenharmony_ci * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
14088c2ecf20Sopenharmony_ci * @page: Page defining the wait queue of interest
14098c2ecf20Sopenharmony_ci * @waiter: Waiter to add to the queue
14108c2ecf20Sopenharmony_ci *
14118c2ecf20Sopenharmony_ci * Add an arbitrary @waiter to the wait queue for the nominated @page.
14128c2ecf20Sopenharmony_ci */
14138c2ecf20Sopenharmony_civoid add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
14148c2ecf20Sopenharmony_ci{
14158c2ecf20Sopenharmony_ci	wait_queue_head_t *q = page_waitqueue(page);
14168c2ecf20Sopenharmony_ci	unsigned long flags;
14178c2ecf20Sopenharmony_ci
14188c2ecf20Sopenharmony_ci	spin_lock_irqsave(&q->lock, flags);
14198c2ecf20Sopenharmony_ci	__add_wait_queue_entry_tail(q, waiter);
14208c2ecf20Sopenharmony_ci	SetPageWaiters(page);
14218c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&q->lock, flags);
14228c2ecf20Sopenharmony_ci}
14238c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(add_page_wait_queue);
14248c2ecf20Sopenharmony_ci
#ifndef clear_bit_unlock_is_negative_byte

/*
 * PG_waiters is the high bit in the same byte as PG_lock.
 *
 * On x86 (and on many other architectures), we can clear PG_lock and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
	clear_bit_unlock(nr, mem);
	/* smp_mb__after_atomic(); */
	/* barrier deliberately omitted: same-byte read, see comment above */
	return test_bit(PG_waiters, mem);
}

#endif
14478c2ecf20Sopenharmony_ci
/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
	/* PG_waiters must be the sign bit of the byte holding PG_locked */
	BUILD_BUG_ON(PG_waiters != 7);
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	/* Only take the slow wakeup path if PG_waiters was set */
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
14728c2ecf20Sopenharmony_ci
/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
	 */
	if (PageReclaim(page)) {
		ClearPageReclaim(page);
		rotate_reclaimable_page(page);
	}

	/*
	 * Writeback does not hold a page reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the page is not freed and
	 * reused before the wake_up_page().
	 */
	get_page(page);
	if (!test_clear_page_writeback(page))
		BUG();

	/* order the PG_writeback clear before waking any waiters */
	smp_mb__after_atomic();
	wake_up_page(page, PG_writeback);
	put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
15068c2ecf20Sopenharmony_ci
15078c2ecf20Sopenharmony_ci/*
15088c2ecf20Sopenharmony_ci * After completing I/O on a page, call this routine to update the page
15098c2ecf20Sopenharmony_ci * flags appropriately
15108c2ecf20Sopenharmony_ci */
15118c2ecf20Sopenharmony_civoid page_endio(struct page *page, bool is_write, int err)
15128c2ecf20Sopenharmony_ci{
15138c2ecf20Sopenharmony_ci	if (!is_write) {
15148c2ecf20Sopenharmony_ci		if (!err) {
15158c2ecf20Sopenharmony_ci			SetPageUptodate(page);
15168c2ecf20Sopenharmony_ci		} else {
15178c2ecf20Sopenharmony_ci			ClearPageUptodate(page);
15188c2ecf20Sopenharmony_ci			SetPageError(page);
15198c2ecf20Sopenharmony_ci		}
15208c2ecf20Sopenharmony_ci		unlock_page(page);
15218c2ecf20Sopenharmony_ci	} else {
15228c2ecf20Sopenharmony_ci		if (err) {
15238c2ecf20Sopenharmony_ci			struct address_space *mapping;
15248c2ecf20Sopenharmony_ci
15258c2ecf20Sopenharmony_ci			SetPageError(page);
15268c2ecf20Sopenharmony_ci			mapping = page_mapping(page);
15278c2ecf20Sopenharmony_ci			if (mapping)
15288c2ecf20Sopenharmony_ci				mapping_set_error(mapping, err);
15298c2ecf20Sopenharmony_ci		}
15308c2ecf20Sopenharmony_ci		end_page_writeback(page);
15318c2ecf20Sopenharmony_ci	}
15328c2ecf20Sopenharmony_ci}
15338c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(page_endio);
15348c2ecf20Sopenharmony_ci
15358c2ecf20Sopenharmony_ci/**
15368c2ecf20Sopenharmony_ci * __lock_page - get a lock on the page, assuming we need to sleep to get it
15378c2ecf20Sopenharmony_ci * @__page: the page to lock
15388c2ecf20Sopenharmony_ci */
15398c2ecf20Sopenharmony_civoid __lock_page(struct page *__page)
15408c2ecf20Sopenharmony_ci{
15418c2ecf20Sopenharmony_ci	struct page *page = compound_head(__page);
15428c2ecf20Sopenharmony_ci	wait_queue_head_t *q = page_waitqueue(page);
15438c2ecf20Sopenharmony_ci	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
15448c2ecf20Sopenharmony_ci				EXCLUSIVE);
15458c2ecf20Sopenharmony_ci}
15468c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__lock_page);
15478c2ecf20Sopenharmony_ci
15488c2ecf20Sopenharmony_ciint __lock_page_killable(struct page *__page)
15498c2ecf20Sopenharmony_ci{
15508c2ecf20Sopenharmony_ci	struct page *page = compound_head(__page);
15518c2ecf20Sopenharmony_ci	wait_queue_head_t *q = page_waitqueue(page);
15528c2ecf20Sopenharmony_ci	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
15538c2ecf20Sopenharmony_ci					EXCLUSIVE);
15548c2ecf20Sopenharmony_ci}
15558c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__lock_page_killable);
15568c2ecf20Sopenharmony_ci
/*
 * Try to lock @page asynchronously.  Returns 0 if the lock was taken
 * immediately, or -EIOCBQUEUED if @wait was queued and its callback
 * will run when the lock becomes available (see
 * __wait_on_page_locked_async()).
 */
int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
	return __wait_on_page_locked_async(page, wait, true);
}
15618c2ecf20Sopenharmony_ci
15628c2ecf20Sopenharmony_ci/*
15638c2ecf20Sopenharmony_ci * Return values:
15648c2ecf20Sopenharmony_ci * 1 - page is locked; mmap_lock is still held.
15658c2ecf20Sopenharmony_ci * 0 - page is not locked.
15668c2ecf20Sopenharmony_ci *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
15678c2ecf20Sopenharmony_ci *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
15688c2ecf20Sopenharmony_ci *     which case mmap_lock is still held.
15698c2ecf20Sopenharmony_ci *
15708c2ecf20Sopenharmony_ci * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
15718c2ecf20Sopenharmony_ci * with the page locked and the mmap_lock unperturbed.
15728c2ecf20Sopenharmony_ci */
15738c2ecf20Sopenharmony_ciint __lock_page_or_retry(struct page *page, struct mm_struct *mm,
15748c2ecf20Sopenharmony_ci			 unsigned int flags)
15758c2ecf20Sopenharmony_ci{
15768c2ecf20Sopenharmony_ci	if (fault_flag_allow_retry_first(flags)) {
15778c2ecf20Sopenharmony_ci		/*
15788c2ecf20Sopenharmony_ci		 * CAUTION! In this case, mmap_lock is not released
15798c2ecf20Sopenharmony_ci		 * even though return 0.
15808c2ecf20Sopenharmony_ci		 */
15818c2ecf20Sopenharmony_ci		if (flags & FAULT_FLAG_RETRY_NOWAIT)
15828c2ecf20Sopenharmony_ci			return 0;
15838c2ecf20Sopenharmony_ci
15848c2ecf20Sopenharmony_ci		mmap_read_unlock(mm);
15858c2ecf20Sopenharmony_ci		if (flags & FAULT_FLAG_KILLABLE)
15868c2ecf20Sopenharmony_ci			wait_on_page_locked_killable(page);
15878c2ecf20Sopenharmony_ci		else
15888c2ecf20Sopenharmony_ci			wait_on_page_locked(page);
15898c2ecf20Sopenharmony_ci		return 0;
15908c2ecf20Sopenharmony_ci	} else {
15918c2ecf20Sopenharmony_ci		if (flags & FAULT_FLAG_KILLABLE) {
15928c2ecf20Sopenharmony_ci			int ret;
15938c2ecf20Sopenharmony_ci
15948c2ecf20Sopenharmony_ci			ret = __lock_page_killable(page);
15958c2ecf20Sopenharmony_ci			if (ret) {
15968c2ecf20Sopenharmony_ci				mmap_read_unlock(mm);
15978c2ecf20Sopenharmony_ci				return 0;
15988c2ecf20Sopenharmony_ci			}
15998c2ecf20Sopenharmony_ci		} else
16008c2ecf20Sopenharmony_ci			__lock_page(page);
16018c2ecf20Sopenharmony_ci		return 1;
16028c2ecf20Sopenharmony_ci	}
16038c2ecf20Sopenharmony_ci}
16048c2ecf20Sopenharmony_ci
16058c2ecf20Sopenharmony_ci/**
16068c2ecf20Sopenharmony_ci * page_cache_next_miss() - Find the next gap in the page cache.
16078c2ecf20Sopenharmony_ci * @mapping: Mapping.
16088c2ecf20Sopenharmony_ci * @index: Index.
16098c2ecf20Sopenharmony_ci * @max_scan: Maximum range to search.
16108c2ecf20Sopenharmony_ci *
16118c2ecf20Sopenharmony_ci * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
16128c2ecf20Sopenharmony_ci * gap with the lowest index.
16138c2ecf20Sopenharmony_ci *
16148c2ecf20Sopenharmony_ci * This function may be called under the rcu_read_lock.  However, this will
16158c2ecf20Sopenharmony_ci * not atomically search a snapshot of the cache at a single point in time.
16168c2ecf20Sopenharmony_ci * For example, if a gap is created at index 5, then subsequently a gap is
16178c2ecf20Sopenharmony_ci * created at index 10, page_cache_next_miss covering both indices may
16188c2ecf20Sopenharmony_ci * return 10 if called under the rcu_read_lock.
16198c2ecf20Sopenharmony_ci *
16208c2ecf20Sopenharmony_ci * Return: The index of the gap if found, otherwise an index outside the
16218c2ecf20Sopenharmony_ci * range specified (in which case 'return - index >= max_scan' will be true).
16228c2ecf20Sopenharmony_ci * In the rare case of index wrap-around, 0 will be returned.
16238c2ecf20Sopenharmony_ci */
16248c2ecf20Sopenharmony_cipgoff_t page_cache_next_miss(struct address_space *mapping,
16258c2ecf20Sopenharmony_ci			     pgoff_t index, unsigned long max_scan)
16268c2ecf20Sopenharmony_ci{
16278c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, index);
16288c2ecf20Sopenharmony_ci
16298c2ecf20Sopenharmony_ci	while (max_scan--) {
16308c2ecf20Sopenharmony_ci		void *entry = xas_next(&xas);
16318c2ecf20Sopenharmony_ci		if (!entry || xa_is_value(entry))
16328c2ecf20Sopenharmony_ci			break;
16338c2ecf20Sopenharmony_ci		if (xas.xa_index == 0)
16348c2ecf20Sopenharmony_ci			break;
16358c2ecf20Sopenharmony_ci	}
16368c2ecf20Sopenharmony_ci
16378c2ecf20Sopenharmony_ci	return xas.xa_index;
16388c2ecf20Sopenharmony_ci}
16398c2ecf20Sopenharmony_ciEXPORT_SYMBOL(page_cache_next_miss);
16408c2ecf20Sopenharmony_ci
16418c2ecf20Sopenharmony_ci/**
16428c2ecf20Sopenharmony_ci * page_cache_prev_miss() - Find the previous gap in the page cache.
16438c2ecf20Sopenharmony_ci * @mapping: Mapping.
16448c2ecf20Sopenharmony_ci * @index: Index.
16458c2ecf20Sopenharmony_ci * @max_scan: Maximum range to search.
16468c2ecf20Sopenharmony_ci *
16478c2ecf20Sopenharmony_ci * Search the range [max(index - max_scan + 1, 0), index] for the
16488c2ecf20Sopenharmony_ci * gap with the highest index.
16498c2ecf20Sopenharmony_ci *
16508c2ecf20Sopenharmony_ci * This function may be called under the rcu_read_lock.  However, this will
16518c2ecf20Sopenharmony_ci * not atomically search a snapshot of the cache at a single point in time.
16528c2ecf20Sopenharmony_ci * For example, if a gap is created at index 10, then subsequently a gap is
16538c2ecf20Sopenharmony_ci * created at index 5, page_cache_prev_miss() covering both indices may
16548c2ecf20Sopenharmony_ci * return 5 if called under the rcu_read_lock.
16558c2ecf20Sopenharmony_ci *
16568c2ecf20Sopenharmony_ci * Return: The index of the gap if found, otherwise an index outside the
16578c2ecf20Sopenharmony_ci * range specified (in which case 'index - return >= max_scan' will be true).
16588c2ecf20Sopenharmony_ci * In the rare case of wrap-around, ULONG_MAX will be returned.
16598c2ecf20Sopenharmony_ci */
16608c2ecf20Sopenharmony_cipgoff_t page_cache_prev_miss(struct address_space *mapping,
16618c2ecf20Sopenharmony_ci			     pgoff_t index, unsigned long max_scan)
16628c2ecf20Sopenharmony_ci{
16638c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, index);
16648c2ecf20Sopenharmony_ci
16658c2ecf20Sopenharmony_ci	while (max_scan--) {
16668c2ecf20Sopenharmony_ci		void *entry = xas_prev(&xas);
16678c2ecf20Sopenharmony_ci		if (!entry || xa_is_value(entry))
16688c2ecf20Sopenharmony_ci			break;
16698c2ecf20Sopenharmony_ci		if (xas.xa_index == ULONG_MAX)
16708c2ecf20Sopenharmony_ci			break;
16718c2ecf20Sopenharmony_ci	}
16728c2ecf20Sopenharmony_ci
16738c2ecf20Sopenharmony_ci	return xas.xa_index;
16748c2ecf20Sopenharmony_ci}
16758c2ecf20Sopenharmony_ciEXPORT_SYMBOL(page_cache_prev_miss);
16768c2ecf20Sopenharmony_ci
/**
 * find_get_entry - find and get a page cache entry
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, the head page is returned with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Context: RCU read lock taken internally; safe to call without locks.
 * Return: The head page or shadow entry, %NULL if nothing is found.
 */
struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	page = xas_load(&xas);
	if (xas_retry(&xas, page))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!page || xa_is_value(page))
		goto out;

	/* Couldn't get a reference (page may be being freed): retry lookup */
	if (!page_cache_get_speculative(page))
		goto repeat;

	/*
	 * Has the page moved or been split?
	 * This is part of the lockless pagecache protocol. See
	 * include/linux/pagemap.h for details.
	 */
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return page;
}
17258c2ecf20Sopenharmony_ci
/**
 * find_lock_entry - Locate and lock a page cache entry.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * Looks up the page at @mapping & @index.  If there is a page in the
 * cache, the head page is returned locked and with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Context: May sleep.
 * Return: The head page or shadow entry, %NULL if nothing is found.
 */
struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, index);
	if (page && !xa_is_value(page)) {
		lock_page(page);
		/*
		 * Has the page been truncated while we slept on the lock?
		 * If so, drop it and look the index up again.
		 */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
	}
	return page;
}
17588c2ecf20Sopenharmony_ci
17598c2ecf20Sopenharmony_ci/**
17608c2ecf20Sopenharmony_ci * pagecache_get_page - Find and get a reference to a page.
17618c2ecf20Sopenharmony_ci * @mapping: The address_space to search.
17628c2ecf20Sopenharmony_ci * @index: The page index.
17638c2ecf20Sopenharmony_ci * @fgp_flags: %FGP flags modify how the page is returned.
17648c2ecf20Sopenharmony_ci * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
17658c2ecf20Sopenharmony_ci *
17668c2ecf20Sopenharmony_ci * Looks up the page cache entry at @mapping & @index.
17678c2ecf20Sopenharmony_ci *
17688c2ecf20Sopenharmony_ci * @fgp_flags can be zero or more of these flags:
17698c2ecf20Sopenharmony_ci *
17708c2ecf20Sopenharmony_ci * * %FGP_ACCESSED - The page will be marked accessed.
17718c2ecf20Sopenharmony_ci * * %FGP_LOCK - The page is returned locked.
17728c2ecf20Sopenharmony_ci * * %FGP_HEAD - If the page is present and a THP, return the head page
17738c2ecf20Sopenharmony_ci *   rather than the exact page specified by the index.
17748c2ecf20Sopenharmony_ci * * %FGP_CREAT - If no page is present then a new page is allocated using
17758c2ecf20Sopenharmony_ci *   @gfp_mask and added to the page cache and the VM's LRU list.
17768c2ecf20Sopenharmony_ci *   The page is returned locked and with an increased refcount.
17778c2ecf20Sopenharmony_ci * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
17788c2ecf20Sopenharmony_ci *   page is already in cache.  If the page was allocated, unlock it before
17798c2ecf20Sopenharmony_ci *   returning so the caller can do the same dance.
17808c2ecf20Sopenharmony_ci * * %FGP_WRITE - The page will be written
17818c2ecf20Sopenharmony_ci * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
17828c2ecf20Sopenharmony_ci * * %FGP_NOWAIT - Don't get blocked by page lock
17838c2ecf20Sopenharmony_ci *
17848c2ecf20Sopenharmony_ci * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
17858c2ecf20Sopenharmony_ci * if the %GFP flags specified for %FGP_CREAT are atomic.
17868c2ecf20Sopenharmony_ci *
17878c2ecf20Sopenharmony_ci * If there is a page cache page, it is returned with an increased refcount.
17888c2ecf20Sopenharmony_ci *
17898c2ecf20Sopenharmony_ci * Return: The found page or %NULL otherwise.
17908c2ecf20Sopenharmony_ci */
17918c2ecf20Sopenharmony_cistruct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
17928c2ecf20Sopenharmony_ci		int fgp_flags, gfp_t gfp_mask)
17938c2ecf20Sopenharmony_ci{
17948c2ecf20Sopenharmony_ci	struct page *page;
17958c2ecf20Sopenharmony_ci
17968c2ecf20Sopenharmony_cirepeat:
17978c2ecf20Sopenharmony_ci	page = find_get_entry(mapping, index);
	/* Shadow/swap value entries count as a cache miss for this interface. */
17988c2ecf20Sopenharmony_ci	if (xa_is_value(page))
17998c2ecf20Sopenharmony_ci		page = NULL;
18008c2ecf20Sopenharmony_ci	if (!page)
18018c2ecf20Sopenharmony_ci		goto no_page;
18028c2ecf20Sopenharmony_ci
18038c2ecf20Sopenharmony_ci	if (fgp_flags & FGP_LOCK) {
18048c2ecf20Sopenharmony_ci		if (fgp_flags & FGP_NOWAIT) {
			/* Caller asked not to block: give up if the lock is contended. */
18058c2ecf20Sopenharmony_ci			if (!trylock_page(page)) {
18068c2ecf20Sopenharmony_ci				put_page(page);
18078c2ecf20Sopenharmony_ci				return NULL;
18088c2ecf20Sopenharmony_ci			}
18098c2ecf20Sopenharmony_ci		} else {
18108c2ecf20Sopenharmony_ci			lock_page(page);
18118c2ecf20Sopenharmony_ci		}
18128c2ecf20Sopenharmony_ci
18138c2ecf20Sopenharmony_ci		/* Has the page been truncated? */
18148c2ecf20Sopenharmony_ci		if (unlikely(page->mapping != mapping)) {
18158c2ecf20Sopenharmony_ci			unlock_page(page);
18168c2ecf20Sopenharmony_ci			put_page(page);
18178c2ecf20Sopenharmony_ci			goto repeat;
18188c2ecf20Sopenharmony_ci		}
18198c2ecf20Sopenharmony_ci		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
18208c2ecf20Sopenharmony_ci	}
18218c2ecf20Sopenharmony_ci
18228c2ecf20Sopenharmony_ci	if (fgp_flags & FGP_ACCESSED)
18238c2ecf20Sopenharmony_ci		mark_page_accessed(page);
18248c2ecf20Sopenharmony_ci	else if (fgp_flags & FGP_WRITE) {
18258c2ecf20Sopenharmony_ci		/* Clear idle flag for buffer write */
18268c2ecf20Sopenharmony_ci		if (page_is_idle(page))
18278c2ecf20Sopenharmony_ci			clear_page_idle(page);
18288c2ecf20Sopenharmony_ci	}
	/* Without FGP_HEAD, translate the head page to the subpage at @index. */
18298c2ecf20Sopenharmony_ci	if (!(fgp_flags & FGP_HEAD))
18308c2ecf20Sopenharmony_ci		page = find_subpage(page, index);
18318c2ecf20Sopenharmony_ci
18328c2ecf20Sopenharmony_cino_page:
18338c2ecf20Sopenharmony_ci	if (!page && (fgp_flags & FGP_CREAT)) {
18348c2ecf20Sopenharmony_ci		int err;
18358c2ecf20Sopenharmony_ci		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
18368c2ecf20Sopenharmony_ci			gfp_mask |= __GFP_WRITE;
18378c2ecf20Sopenharmony_ci		if (fgp_flags & FGP_NOFS)
18388c2ecf20Sopenharmony_ci			gfp_mask &= ~__GFP_FS;
18398c2ecf20Sopenharmony_ci
18408c2ecf20Sopenharmony_ci		page = __page_cache_alloc(gfp_mask);
18418c2ecf20Sopenharmony_ci		if (!page)
18428c2ecf20Sopenharmony_ci			return NULL;
18438c2ecf20Sopenharmony_ci
		/* Callers must either ask for the lock or do their own (FGP_FOR_MMAP). */
18448c2ecf20Sopenharmony_ci		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
18458c2ecf20Sopenharmony_ci			fgp_flags |= FGP_LOCK;
18468c2ecf20Sopenharmony_ci
18478c2ecf20Sopenharmony_ci		/* Init accessed so avoid atomic mark_page_accessed later */
18488c2ecf20Sopenharmony_ci		if (fgp_flags & FGP_ACCESSED)
18498c2ecf20Sopenharmony_ci			__SetPageReferenced(page);
18508c2ecf20Sopenharmony_ci
18518c2ecf20Sopenharmony_ci		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
18528c2ecf20Sopenharmony_ci		if (unlikely(err)) {
18538c2ecf20Sopenharmony_ci			put_page(page);
18548c2ecf20Sopenharmony_ci			page = NULL;
			/* Lost a race to install the page; look up the winner instead. */
18558c2ecf20Sopenharmony_ci			if (err == -EEXIST)
18568c2ecf20Sopenharmony_ci				goto repeat;
18578c2ecf20Sopenharmony_ci		}
18588c2ecf20Sopenharmony_ci
18598c2ecf20Sopenharmony_ci		/*
18608c2ecf20Sopenharmony_ci		 * add_to_page_cache_lru locks the page, and for mmap we expect
18618c2ecf20Sopenharmony_ci		 * an unlocked page.
18628c2ecf20Sopenharmony_ci		 */
18638c2ecf20Sopenharmony_ci		if (page && (fgp_flags & FGP_FOR_MMAP))
18648c2ecf20Sopenharmony_ci			unlock_page(page);
18658c2ecf20Sopenharmony_ci	}
18668c2ecf20Sopenharmony_ci
18678c2ecf20Sopenharmony_ci	return page;
18688c2ecf20Sopenharmony_ci}
18698c2ecf20Sopenharmony_ciEXPORT_SYMBOL(pagecache_get_page);
18708c2ecf20Sopenharmony_ci
18718c2ecf20Sopenharmony_ci/**
18728c2ecf20Sopenharmony_ci * find_get_entries - gang pagecache lookup
18738c2ecf20Sopenharmony_ci * @mapping:	The address_space to search
18748c2ecf20Sopenharmony_ci * @start:	The starting page cache index
18758c2ecf20Sopenharmony_ci * @nr_entries:	The maximum number of entries
18768c2ecf20Sopenharmony_ci * @entries:	Where the resulting entries are placed
18778c2ecf20Sopenharmony_ci * @indices:	The cache indices corresponding to the entries in @entries
18788c2ecf20Sopenharmony_ci *
18798c2ecf20Sopenharmony_ci * find_get_entries() will search for and return a group of up to
18808c2ecf20Sopenharmony_ci * @nr_entries entries in the mapping.  The entries are placed at
18818c2ecf20Sopenharmony_ci * @entries.  find_get_entries() takes a reference against any actual
18828c2ecf20Sopenharmony_ci * pages it returns.
18838c2ecf20Sopenharmony_ci *
18848c2ecf20Sopenharmony_ci * The search returns a group of mapping-contiguous page cache entries
18858c2ecf20Sopenharmony_ci * with ascending indexes.  There may be holes in the indices due to
18868c2ecf20Sopenharmony_ci * not-present pages.
18878c2ecf20Sopenharmony_ci *
18888c2ecf20Sopenharmony_ci * Any shadow entries of evicted pages, or swap entries from
18898c2ecf20Sopenharmony_ci * shmem/tmpfs, are included in the returned array.
18908c2ecf20Sopenharmony_ci *
18918c2ecf20Sopenharmony_ci * If it finds a Transparent Huge Page, head or tail, find_get_entries()
18928c2ecf20Sopenharmony_ci * stops at that page: the caller is likely to have a better way to handle
18938c2ecf20Sopenharmony_ci * the compound page as a whole, and then skip its extent, than repeatedly
18948c2ecf20Sopenharmony_ci * calling find_get_entries() to return all its tails.
18958c2ecf20Sopenharmony_ci *
18968c2ecf20Sopenharmony_ci * Return: the number of pages and shadow entries which were found.
18978c2ecf20Sopenharmony_ci */
18988c2ecf20Sopenharmony_ciunsigned find_get_entries(struct address_space *mapping,
18998c2ecf20Sopenharmony_ci			  pgoff_t start, unsigned int nr_entries,
19008c2ecf20Sopenharmony_ci			  struct page **entries, pgoff_t *indices)
19018c2ecf20Sopenharmony_ci{
19028c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, start);
19038c2ecf20Sopenharmony_ci	struct page *page;
19048c2ecf20Sopenharmony_ci	unsigned int ret = 0;
19058c2ecf20Sopenharmony_ci
19068c2ecf20Sopenharmony_ci	if (!nr_entries)
19078c2ecf20Sopenharmony_ci		return 0;
19088c2ecf20Sopenharmony_ci
19098c2ecf20Sopenharmony_ci	rcu_read_lock();
19108c2ecf20Sopenharmony_ci	xas_for_each(&xas, page, ULONG_MAX) {
19118c2ecf20Sopenharmony_ci		if (xas_retry(&xas, page))
19128c2ecf20Sopenharmony_ci			continue;
19138c2ecf20Sopenharmony_ci		/*
19148c2ecf20Sopenharmony_ci		 * A shadow entry of a recently evicted page, a swap
19158c2ecf20Sopenharmony_ci		 * entry from shmem/tmpfs or a DAX entry.  Return it
19168c2ecf20Sopenharmony_ci		 * without attempting to raise page count.
19178c2ecf20Sopenharmony_ci		 */
19188c2ecf20Sopenharmony_ci		if (xa_is_value(page))
19198c2ecf20Sopenharmony_ci			goto export;
19208c2ecf20Sopenharmony_ci
19218c2ecf20Sopenharmony_ci		if (!page_cache_get_speculative(page))
19228c2ecf20Sopenharmony_ci			goto retry;
19238c2ecf20Sopenharmony_ci
19248c2ecf20Sopenharmony_ci		/* Has the page moved or been split? */
19258c2ecf20Sopenharmony_ci		if (unlikely(page != xas_reload(&xas)))
19268c2ecf20Sopenharmony_ci			goto put_page;
19278c2ecf20Sopenharmony_ci
19288c2ecf20Sopenharmony_ci		/*
19298c2ecf20Sopenharmony_ci		 * Terminate early on finding a THP, to allow the caller to
19308c2ecf20Sopenharmony_ci		 * handle it all at once; but continue if this is hugetlbfs.
19318c2ecf20Sopenharmony_ci		 */
19328c2ecf20Sopenharmony_ci		if (PageTransHuge(page) && !PageHuge(page)) {
19338c2ecf20Sopenharmony_ci			page = find_subpage(page, xas.xa_index);
			/* Make this THP the final entry: ++ret below will hit nr_entries. */
19348c2ecf20Sopenharmony_ci			nr_entries = ret + 1;
19358c2ecf20Sopenharmony_ci		}
19368c2ecf20Sopenharmony_ciexport:
19378c2ecf20Sopenharmony_ci		indices[ret] = xas.xa_index;
19388c2ecf20Sopenharmony_ci		entries[ret] = page;
19398c2ecf20Sopenharmony_ci		if (++ret == nr_entries)
19408c2ecf20Sopenharmony_ci			break;
19418c2ecf20Sopenharmony_ci		continue;
19428c2ecf20Sopenharmony_ciput_page:
19438c2ecf20Sopenharmony_ci		put_page(page);
19448c2ecf20Sopenharmony_ciretry:
		/* Raced with an update: reset the XArray state and retry this index. */
19458c2ecf20Sopenharmony_ci		xas_reset(&xas);
19468c2ecf20Sopenharmony_ci	}
19478c2ecf20Sopenharmony_ci	rcu_read_unlock();
19488c2ecf20Sopenharmony_ci	return ret;
19498c2ecf20Sopenharmony_ci}
19508c2ecf20Sopenharmony_ci
19518c2ecf20Sopenharmony_ci/**
19528c2ecf20Sopenharmony_ci * find_get_pages_range - gang pagecache lookup
19538c2ecf20Sopenharmony_ci * @mapping:	The address_space to search
19548c2ecf20Sopenharmony_ci * @start:	The starting page index
19558c2ecf20Sopenharmony_ci * @end:	The final page index (inclusive)
19568c2ecf20Sopenharmony_ci * @nr_pages:	The maximum number of pages
19578c2ecf20Sopenharmony_ci * @pages:	Where the resulting pages are placed
19588c2ecf20Sopenharmony_ci *
19598c2ecf20Sopenharmony_ci * find_get_pages_range() will search for and return a group of up to @nr_pages
19608c2ecf20Sopenharmony_ci * pages in the mapping starting at index @start and up to index @end
19618c2ecf20Sopenharmony_ci * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
19628c2ecf20Sopenharmony_ci * a reference against the returned pages.
19638c2ecf20Sopenharmony_ci *
19648c2ecf20Sopenharmony_ci * The search returns a group of mapping-contiguous pages with ascending
19658c2ecf20Sopenharmony_ci * indexes.  There may be holes in the indices due to not-present pages.
19668c2ecf20Sopenharmony_ci * We also update @start to index the next page for the traversal.
19678c2ecf20Sopenharmony_ci *
19688c2ecf20Sopenharmony_ci * Return: the number of pages which were found. If this number is
19698c2ecf20Sopenharmony_ci * smaller than @nr_pages, the end of specified range has been
19708c2ecf20Sopenharmony_ci * reached.
19718c2ecf20Sopenharmony_ci */
19728c2ecf20Sopenharmony_ciunsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
19738c2ecf20Sopenharmony_ci			      pgoff_t end, unsigned int nr_pages,
19748c2ecf20Sopenharmony_ci			      struct page **pages)
19758c2ecf20Sopenharmony_ci{
19768c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, *start);
19778c2ecf20Sopenharmony_ci	struct page *page;
19788c2ecf20Sopenharmony_ci	unsigned ret = 0;
19798c2ecf20Sopenharmony_ci
19808c2ecf20Sopenharmony_ci	if (unlikely(!nr_pages))
19818c2ecf20Sopenharmony_ci		return 0;
19828c2ecf20Sopenharmony_ci
19838c2ecf20Sopenharmony_ci	rcu_read_lock();
19848c2ecf20Sopenharmony_ci	xas_for_each(&xas, page, end) {
19858c2ecf20Sopenharmony_ci		if (xas_retry(&xas, page))
19868c2ecf20Sopenharmony_ci			continue;
19878c2ecf20Sopenharmony_ci		/* Skip over shadow, swap and DAX entries */
19888c2ecf20Sopenharmony_ci		if (xa_is_value(page))
19898c2ecf20Sopenharmony_ci			continue;
19908c2ecf20Sopenharmony_ci
19918c2ecf20Sopenharmony_ci		if (!page_cache_get_speculative(page))
19928c2ecf20Sopenharmony_ci			goto retry;
19938c2ecf20Sopenharmony_ci
19948c2ecf20Sopenharmony_ci		/* Has the page moved or been split? */
19958c2ecf20Sopenharmony_ci		if (unlikely(page != xas_reload(&xas)))
19968c2ecf20Sopenharmony_ci			goto put_page;
19978c2ecf20Sopenharmony_ci
19988c2ecf20Sopenharmony_ci		pages[ret] = find_subpage(page, xas.xa_index);
		/* Array full: record where the next traversal should resume. */
19998c2ecf20Sopenharmony_ci		if (++ret == nr_pages) {
20008c2ecf20Sopenharmony_ci			*start = xas.xa_index + 1;
20018c2ecf20Sopenharmony_ci			goto out;
20028c2ecf20Sopenharmony_ci		}
20038c2ecf20Sopenharmony_ci		continue;
20048c2ecf20Sopenharmony_ciput_page:
20058c2ecf20Sopenharmony_ci		put_page(page);
20068c2ecf20Sopenharmony_ciretry:
		/* Raced with an update: reset the XArray state and retry this index. */
20078c2ecf20Sopenharmony_ci		xas_reset(&xas);
20088c2ecf20Sopenharmony_ci	}
20098c2ecf20Sopenharmony_ci
20108c2ecf20Sopenharmony_ci	/*
20118c2ecf20Sopenharmony_ci	 * We come here when there is no page beyond @end. We take care to not
20128c2ecf20Sopenharmony_ci	 * overflow the index @start as it confuses some of the callers. This
20138c2ecf20Sopenharmony_ci	 * breaks the iteration when there is a page at index -1 but that is
20148c2ecf20Sopenharmony_ci	 * already broken anyway.
20158c2ecf20Sopenharmony_ci	 */
20168c2ecf20Sopenharmony_ci	if (end == (pgoff_t)-1)
20178c2ecf20Sopenharmony_ci		*start = (pgoff_t)-1;
20188c2ecf20Sopenharmony_ci	else
20198c2ecf20Sopenharmony_ci		*start = end + 1;
20208c2ecf20Sopenharmony_ciout:
20218c2ecf20Sopenharmony_ci	rcu_read_unlock();
20228c2ecf20Sopenharmony_ci
20238c2ecf20Sopenharmony_ci	return ret;
20248c2ecf20Sopenharmony_ci}
20258c2ecf20Sopenharmony_ci
20268c2ecf20Sopenharmony_ci/**
20278c2ecf20Sopenharmony_ci * find_get_pages_contig - gang contiguous pagecache lookup
20288c2ecf20Sopenharmony_ci * @mapping:	The address_space to search
20298c2ecf20Sopenharmony_ci * @index:	The starting page index
20308c2ecf20Sopenharmony_ci * @nr_pages:	The maximum number of pages
20318c2ecf20Sopenharmony_ci * @pages:	Where the resulting pages are placed
20328c2ecf20Sopenharmony_ci *
20338c2ecf20Sopenharmony_ci * find_get_pages_contig() works exactly like find_get_pages(), except
20348c2ecf20Sopenharmony_ci * that the returned number of pages are guaranteed to be contiguous.
20358c2ecf20Sopenharmony_ci *
20368c2ecf20Sopenharmony_ci * Return: the number of pages which were found.
20378c2ecf20Sopenharmony_ci */
20388c2ecf20Sopenharmony_ciunsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
20398c2ecf20Sopenharmony_ci			       unsigned int nr_pages, struct page **pages)
20408c2ecf20Sopenharmony_ci{
20418c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, index);
20428c2ecf20Sopenharmony_ci	struct page *page;
20438c2ecf20Sopenharmony_ci	unsigned int ret = 0;
20448c2ecf20Sopenharmony_ci
20458c2ecf20Sopenharmony_ci	if (unlikely(!nr_pages))
20468c2ecf20Sopenharmony_ci		return 0;
20478c2ecf20Sopenharmony_ci
20488c2ecf20Sopenharmony_ci	rcu_read_lock();
	/* Step through adjacent indices; a NULL entry (hole) ends the loop,
	 * which is what keeps the returned run contiguous. */
20498c2ecf20Sopenharmony_ci	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
20508c2ecf20Sopenharmony_ci		if (xas_retry(&xas, page))
20518c2ecf20Sopenharmony_ci			continue;
20528c2ecf20Sopenharmony_ci		/*
20538c2ecf20Sopenharmony_ci		 * If the entry has been swapped out, we can stop looking.
20548c2ecf20Sopenharmony_ci		 * No current caller is looking for DAX entries.
20558c2ecf20Sopenharmony_ci		 */
20568c2ecf20Sopenharmony_ci		if (xa_is_value(page))
20578c2ecf20Sopenharmony_ci			break;
20588c2ecf20Sopenharmony_ci
20598c2ecf20Sopenharmony_ci		if (!page_cache_get_speculative(page))
20608c2ecf20Sopenharmony_ci			goto retry;
20618c2ecf20Sopenharmony_ci
20628c2ecf20Sopenharmony_ci		/* Has the page moved or been split? */
20638c2ecf20Sopenharmony_ci		if (unlikely(page != xas_reload(&xas)))
20648c2ecf20Sopenharmony_ci			goto put_page;
20658c2ecf20Sopenharmony_ci
20668c2ecf20Sopenharmony_ci		pages[ret] = find_subpage(page, xas.xa_index);
20678c2ecf20Sopenharmony_ci		if (++ret == nr_pages)
20688c2ecf20Sopenharmony_ci			break;
20698c2ecf20Sopenharmony_ci		continue;
20708c2ecf20Sopenharmony_ciput_page:
20718c2ecf20Sopenharmony_ci		put_page(page);
20728c2ecf20Sopenharmony_ciretry:
		/* Raced with an update: reset the XArray state and retry this index. */
20738c2ecf20Sopenharmony_ci		xas_reset(&xas);
20748c2ecf20Sopenharmony_ci	}
20758c2ecf20Sopenharmony_ci	rcu_read_unlock();
20768c2ecf20Sopenharmony_ci	return ret;
20778c2ecf20Sopenharmony_ci}
20788c2ecf20Sopenharmony_ciEXPORT_SYMBOL(find_get_pages_contig);
20798c2ecf20Sopenharmony_ci
20808c2ecf20Sopenharmony_ci/**
20818c2ecf20Sopenharmony_ci * find_get_pages_range_tag - find and return pages in given range matching @tag
20828c2ecf20Sopenharmony_ci * @mapping:	the address_space to search
20838c2ecf20Sopenharmony_ci * @index:	the starting page index
20848c2ecf20Sopenharmony_ci * @end:	The final page index (inclusive)
20858c2ecf20Sopenharmony_ci * @tag:	the tag index
20868c2ecf20Sopenharmony_ci * @nr_pages:	the maximum number of pages
20878c2ecf20Sopenharmony_ci * @pages:	where the resulting pages are placed
20888c2ecf20Sopenharmony_ci *
20898c2ecf20Sopenharmony_ci * Like find_get_pages, except we only return pages which are tagged with
20908c2ecf20Sopenharmony_ci * @tag.   We update @index to index the next page for the traversal.
20918c2ecf20Sopenharmony_ci *
20928c2ecf20Sopenharmony_ci * Return: the number of pages which were found.
20938c2ecf20Sopenharmony_ci */
20948c2ecf20Sopenharmony_ciunsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
20958c2ecf20Sopenharmony_ci			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
20968c2ecf20Sopenharmony_ci			struct page **pages)
20978c2ecf20Sopenharmony_ci{
20988c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, *index);
20998c2ecf20Sopenharmony_ci	struct page *page;
21008c2ecf20Sopenharmony_ci	unsigned ret = 0;
21018c2ecf20Sopenharmony_ci
21028c2ecf20Sopenharmony_ci	if (unlikely(!nr_pages))
21038c2ecf20Sopenharmony_ci		return 0;
21048c2ecf20Sopenharmony_ci
21058c2ecf20Sopenharmony_ci	rcu_read_lock();
21068c2ecf20Sopenharmony_ci	xas_for_each_marked(&xas, page, end, tag) {
21078c2ecf20Sopenharmony_ci		if (xas_retry(&xas, page))
21088c2ecf20Sopenharmony_ci			continue;
21098c2ecf20Sopenharmony_ci		/*
21108c2ecf20Sopenharmony_ci		 * Shadow entries should never be tagged, but this iteration
21118c2ecf20Sopenharmony_ci		 * is lockless so there is a window for page reclaim to evict
21128c2ecf20Sopenharmony_ci		 * a page we saw tagged.  Skip over it.
21138c2ecf20Sopenharmony_ci		 */
21148c2ecf20Sopenharmony_ci		if (xa_is_value(page))
21158c2ecf20Sopenharmony_ci			continue;
21168c2ecf20Sopenharmony_ci
21178c2ecf20Sopenharmony_ci		if (!page_cache_get_speculative(page))
21188c2ecf20Sopenharmony_ci			goto retry;
21198c2ecf20Sopenharmony_ci
21208c2ecf20Sopenharmony_ci		/* Has the page moved or been split? */
21218c2ecf20Sopenharmony_ci		if (unlikely(page != xas_reload(&xas)))
21228c2ecf20Sopenharmony_ci			goto put_page;
21238c2ecf20Sopenharmony_ci
21248c2ecf20Sopenharmony_ci		pages[ret] = find_subpage(page, xas.xa_index);
		/* Array full: record where the next traversal should resume. */
21258c2ecf20Sopenharmony_ci		if (++ret == nr_pages) {
21268c2ecf20Sopenharmony_ci			*index = xas.xa_index + 1;
21278c2ecf20Sopenharmony_ci			goto out;
21288c2ecf20Sopenharmony_ci		}
21298c2ecf20Sopenharmony_ci		continue;
21308c2ecf20Sopenharmony_ciput_page:
21318c2ecf20Sopenharmony_ci		put_page(page);
21328c2ecf20Sopenharmony_ciretry:
		/* Raced with an update: reset the XArray state and retry this index. */
21338c2ecf20Sopenharmony_ci		xas_reset(&xas);
21348c2ecf20Sopenharmony_ci	}
21358c2ecf20Sopenharmony_ci
21368c2ecf20Sopenharmony_ci	/*
21378c2ecf20Sopenharmony_ci	 * We come here when we got to @end. We take care to not overflow the
21388c2ecf20Sopenharmony_ci	 * index @index as it confuses some of the callers. This breaks the
21398c2ecf20Sopenharmony_ci	 * iteration when there is a page at index -1 but that is already
21408c2ecf20Sopenharmony_ci	 * broken anyway.
21418c2ecf20Sopenharmony_ci	 */
21428c2ecf20Sopenharmony_ci	if (end == (pgoff_t)-1)
21438c2ecf20Sopenharmony_ci		*index = (pgoff_t)-1;
21448c2ecf20Sopenharmony_ci	else
21458c2ecf20Sopenharmony_ci		*index = end + 1;
21468c2ecf20Sopenharmony_ciout:
21478c2ecf20Sopenharmony_ci	rcu_read_unlock();
21488c2ecf20Sopenharmony_ci
21498c2ecf20Sopenharmony_ci	return ret;
21508c2ecf20Sopenharmony_ci}
21518c2ecf20Sopenharmony_ciEXPORT_SYMBOL(find_get_pages_range_tag);
21528c2ecf20Sopenharmony_ci
21538c2ecf20Sopenharmony_ci/*
21548c2ecf20Sopenharmony_ci * CD/DVDs are error prone. When a medium error occurs, the driver may fail
21558c2ecf20Sopenharmony_ci * a _large_ part of the i/o request. Imagine the worst scenario:
21568c2ecf20Sopenharmony_ci *
21578c2ecf20Sopenharmony_ci *      ---R__________________________________________B__________
21588c2ecf20Sopenharmony_ci *         ^ reading here                             ^ bad block(assume 4k)
21598c2ecf20Sopenharmony_ci *
21608c2ecf20Sopenharmony_ci * read(R) => miss => readahead(R...B) => media error => frustrating retries
21618c2ecf20Sopenharmony_ci * => failing the whole request => read(R) => read(R+1) =>
21628c2ecf20Sopenharmony_ci * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
21638c2ecf20Sopenharmony_ci * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
21648c2ecf20Sopenharmony_ci * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
21658c2ecf20Sopenharmony_ci *
21668c2ecf20Sopenharmony_ci * It is going insane. Fix it by quickly scaling down the readahead size.
21678c2ecf20Sopenharmony_ci */
21688c2ecf20Sopenharmony_cistatic void shrink_readahead_size_eio(struct file_ra_state *ra)
21698c2ecf20Sopenharmony_ci{
	/* Scale the readahead window down to a quarter after an I/O error. */
21708c2ecf20Sopenharmony_ci	ra->ra_pages /= 4;
21718c2ecf20Sopenharmony_ci}
21728c2ecf20Sopenharmony_ci
21738c2ecf20Sopenharmony_ci/**
21748c2ecf20Sopenharmony_ci * generic_file_buffered_read - generic file read routine
21758c2ecf20Sopenharmony_ci * @iocb:	the iocb to read
21768c2ecf20Sopenharmony_ci * @iter:	data destination
21778c2ecf20Sopenharmony_ci * @written:	already copied
21788c2ecf20Sopenharmony_ci *
21798c2ecf20Sopenharmony_ci * This is a generic file read routine, and uses the
21808c2ecf20Sopenharmony_ci * mapping->a_ops->readpage() function for the actual low-level stuff.
21818c2ecf20Sopenharmony_ci *
21828c2ecf20Sopenharmony_ci * This is really ugly. But the goto's actually try to clarify some
21838c2ecf20Sopenharmony_ci * of the logic when it comes to error handling etc.
21848c2ecf20Sopenharmony_ci *
21858c2ecf20Sopenharmony_ci * Return:
21868c2ecf20Sopenharmony_ci * * total number of bytes copied, including those the were already @written
21878c2ecf20Sopenharmony_ci * * negative error code if nothing was copied
21888c2ecf20Sopenharmony_ci */
21898c2ecf20Sopenharmony_cissize_t generic_file_buffered_read(struct kiocb *iocb,
21908c2ecf20Sopenharmony_ci		struct iov_iter *iter, ssize_t written)
21918c2ecf20Sopenharmony_ci{
21928c2ecf20Sopenharmony_ci	struct file *filp = iocb->ki_filp;
21938c2ecf20Sopenharmony_ci	struct address_space *mapping = filp->f_mapping;
21948c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
21958c2ecf20Sopenharmony_ci	struct file_ra_state *ra = &filp->f_ra;
21968c2ecf20Sopenharmony_ci	loff_t *ppos = &iocb->ki_pos;
21978c2ecf20Sopenharmony_ci	pgoff_t index;
21988c2ecf20Sopenharmony_ci	pgoff_t last_index;
21998c2ecf20Sopenharmony_ci	pgoff_t prev_index;
22008c2ecf20Sopenharmony_ci	unsigned long offset;      /* offset into pagecache page */
22018c2ecf20Sopenharmony_ci	unsigned int prev_offset;
22028c2ecf20Sopenharmony_ci	int error = 0;
22038c2ecf20Sopenharmony_ci
22048c2ecf20Sopenharmony_ci	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
22058c2ecf20Sopenharmony_ci		return 0;
22068c2ecf20Sopenharmony_ci	if (unlikely(!iov_iter_count(iter)))
22078c2ecf20Sopenharmony_ci		return 0;
22088c2ecf20Sopenharmony_ci
22098c2ecf20Sopenharmony_ci	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
22108c2ecf20Sopenharmony_ci
22118c2ecf20Sopenharmony_ci	index = *ppos >> PAGE_SHIFT;
22128c2ecf20Sopenharmony_ci	prev_index = ra->prev_pos >> PAGE_SHIFT;
22138c2ecf20Sopenharmony_ci	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
22148c2ecf20Sopenharmony_ci	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
22158c2ecf20Sopenharmony_ci	offset = *ppos & ~PAGE_MASK;
22168c2ecf20Sopenharmony_ci
22178c2ecf20Sopenharmony_ci	/*
22188c2ecf20Sopenharmony_ci	 * If we've already successfully copied some data, then we
22198c2ecf20Sopenharmony_ci	 * can no longer safely return -EIOCBQUEUED. Hence mark
22208c2ecf20Sopenharmony_ci	 * an async read NOWAIT at that point.
22218c2ecf20Sopenharmony_ci	 */
22228c2ecf20Sopenharmony_ci	if (written && (iocb->ki_flags & IOCB_WAITQ))
22238c2ecf20Sopenharmony_ci		iocb->ki_flags |= IOCB_NOWAIT;
22248c2ecf20Sopenharmony_ci
22258c2ecf20Sopenharmony_ci	for (;;) {
22268c2ecf20Sopenharmony_ci		struct page *page;
22278c2ecf20Sopenharmony_ci		pgoff_t end_index;
22288c2ecf20Sopenharmony_ci		loff_t isize;
22298c2ecf20Sopenharmony_ci		unsigned long nr, ret;
22308c2ecf20Sopenharmony_ci
22318c2ecf20Sopenharmony_ci		cond_resched();
22328c2ecf20Sopenharmony_cifind_page:
22338c2ecf20Sopenharmony_ci		if (fatal_signal_pending(current)) {
22348c2ecf20Sopenharmony_ci			error = -EINTR;
22358c2ecf20Sopenharmony_ci			goto out;
22368c2ecf20Sopenharmony_ci		}
22378c2ecf20Sopenharmony_ci
22388c2ecf20Sopenharmony_ci		page = find_get_page(mapping, index);
22398c2ecf20Sopenharmony_ci		if (!page) {
22408c2ecf20Sopenharmony_ci			if (iocb->ki_flags & IOCB_NOIO)
22418c2ecf20Sopenharmony_ci				goto would_block;
22428c2ecf20Sopenharmony_ci			page_cache_sync_readahead(mapping,
22438c2ecf20Sopenharmony_ci					ra, filp,
22448c2ecf20Sopenharmony_ci					index, last_index - index);
22458c2ecf20Sopenharmony_ci			page = find_get_page(mapping, index);
22468c2ecf20Sopenharmony_ci			if (unlikely(page == NULL))
22478c2ecf20Sopenharmony_ci				goto no_cached_page;
22488c2ecf20Sopenharmony_ci		}
22498c2ecf20Sopenharmony_ci		if (PageReadahead(page)) {
22508c2ecf20Sopenharmony_ci			if (iocb->ki_flags & IOCB_NOIO) {
22518c2ecf20Sopenharmony_ci				put_page(page);
22528c2ecf20Sopenharmony_ci				goto out;
22538c2ecf20Sopenharmony_ci			}
22548c2ecf20Sopenharmony_ci			page_cache_async_readahead(mapping,
22558c2ecf20Sopenharmony_ci					ra, filp, page,
22568c2ecf20Sopenharmony_ci					index, last_index - index);
22578c2ecf20Sopenharmony_ci		}
22588c2ecf20Sopenharmony_ci		if (!PageUptodate(page)) {
22598c2ecf20Sopenharmony_ci			/*
22608c2ecf20Sopenharmony_ci			 * See comment in do_read_cache_page on why
22618c2ecf20Sopenharmony_ci			 * wait_on_page_locked is used to avoid unnecessarily
22628c2ecf20Sopenharmony_ci			 * serialisations and why it's safe.
22638c2ecf20Sopenharmony_ci			 */
22648c2ecf20Sopenharmony_ci			if (iocb->ki_flags & IOCB_WAITQ) {
22658c2ecf20Sopenharmony_ci				if (written) {
22668c2ecf20Sopenharmony_ci					put_page(page);
22678c2ecf20Sopenharmony_ci					goto out;
22688c2ecf20Sopenharmony_ci				}
22698c2ecf20Sopenharmony_ci				error = wait_on_page_locked_async(page,
22708c2ecf20Sopenharmony_ci								iocb->ki_waitq);
22718c2ecf20Sopenharmony_ci			} else {
22728c2ecf20Sopenharmony_ci				if (iocb->ki_flags & IOCB_NOWAIT) {
22738c2ecf20Sopenharmony_ci					put_page(page);
22748c2ecf20Sopenharmony_ci					goto would_block;
22758c2ecf20Sopenharmony_ci				}
22768c2ecf20Sopenharmony_ci				error = wait_on_page_locked_killable(page);
22778c2ecf20Sopenharmony_ci			}
22788c2ecf20Sopenharmony_ci			if (unlikely(error))
22798c2ecf20Sopenharmony_ci				goto readpage_error;
22808c2ecf20Sopenharmony_ci			if (PageUptodate(page))
22818c2ecf20Sopenharmony_ci				goto page_ok;
22828c2ecf20Sopenharmony_ci
22838c2ecf20Sopenharmony_ci			if (inode->i_blkbits == PAGE_SHIFT ||
22848c2ecf20Sopenharmony_ci					!mapping->a_ops->is_partially_uptodate)
22858c2ecf20Sopenharmony_ci				goto page_not_up_to_date;
22868c2ecf20Sopenharmony_ci			/* pipes can't handle partially uptodate pages */
22878c2ecf20Sopenharmony_ci			if (unlikely(iov_iter_is_pipe(iter)))
22888c2ecf20Sopenharmony_ci				goto page_not_up_to_date;
22898c2ecf20Sopenharmony_ci			if (!trylock_page(page))
22908c2ecf20Sopenharmony_ci				goto page_not_up_to_date;
22918c2ecf20Sopenharmony_ci			/* Did it get truncated before we got the lock? */
22928c2ecf20Sopenharmony_ci			if (!page->mapping)
22938c2ecf20Sopenharmony_ci				goto page_not_up_to_date_locked;
22948c2ecf20Sopenharmony_ci			if (!mapping->a_ops->is_partially_uptodate(page,
22958c2ecf20Sopenharmony_ci							offset, iter->count))
22968c2ecf20Sopenharmony_ci				goto page_not_up_to_date_locked;
22978c2ecf20Sopenharmony_ci			unlock_page(page);
22988c2ecf20Sopenharmony_ci		}
22998c2ecf20Sopenharmony_cipage_ok:
23008c2ecf20Sopenharmony_ci		/*
23018c2ecf20Sopenharmony_ci		 * i_size must be checked after we know the page is Uptodate.
23028c2ecf20Sopenharmony_ci		 *
23038c2ecf20Sopenharmony_ci		 * Checking i_size after the check allows us to calculate
23048c2ecf20Sopenharmony_ci		 * the correct value for "nr", which means the zero-filled
23058c2ecf20Sopenharmony_ci		 * part of the page is not copied back to userspace (unless
23068c2ecf20Sopenharmony_ci		 * another truncate extends the file - this is desired though).
23078c2ecf20Sopenharmony_ci		 */
23088c2ecf20Sopenharmony_ci
23098c2ecf20Sopenharmony_ci		isize = i_size_read(inode);
23108c2ecf20Sopenharmony_ci		end_index = (isize - 1) >> PAGE_SHIFT;
23118c2ecf20Sopenharmony_ci		if (unlikely(!isize || index > end_index)) {
23128c2ecf20Sopenharmony_ci			put_page(page);
23138c2ecf20Sopenharmony_ci			goto out;
23148c2ecf20Sopenharmony_ci		}
23158c2ecf20Sopenharmony_ci
23168c2ecf20Sopenharmony_ci		/* nr is the maximum number of bytes to copy from this page */
23178c2ecf20Sopenharmony_ci		nr = PAGE_SIZE;
23188c2ecf20Sopenharmony_ci		if (index == end_index) {
23198c2ecf20Sopenharmony_ci			nr = ((isize - 1) & ~PAGE_MASK) + 1;
23208c2ecf20Sopenharmony_ci			if (nr <= offset) {
23218c2ecf20Sopenharmony_ci				put_page(page);
23228c2ecf20Sopenharmony_ci				goto out;
23238c2ecf20Sopenharmony_ci			}
23248c2ecf20Sopenharmony_ci		}
23258c2ecf20Sopenharmony_ci		nr = nr - offset;
23268c2ecf20Sopenharmony_ci
23278c2ecf20Sopenharmony_ci		/* If users can be writing to this page using arbitrary
23288c2ecf20Sopenharmony_ci		 * virtual addresses, take care about potential aliasing
23298c2ecf20Sopenharmony_ci		 * before reading the page on the kernel side.
23308c2ecf20Sopenharmony_ci		 */
23318c2ecf20Sopenharmony_ci		if (mapping_writably_mapped(mapping))
23328c2ecf20Sopenharmony_ci			flush_dcache_page(page);
23338c2ecf20Sopenharmony_ci
23348c2ecf20Sopenharmony_ci		/*
23358c2ecf20Sopenharmony_ci		 * When a sequential read accesses a page several times,
23368c2ecf20Sopenharmony_ci		 * only mark it as accessed the first time.
23378c2ecf20Sopenharmony_ci		 */
23388c2ecf20Sopenharmony_ci		if (prev_index != index || offset != prev_offset)
23398c2ecf20Sopenharmony_ci			mark_page_accessed(page);
23408c2ecf20Sopenharmony_ci		prev_index = index;
23418c2ecf20Sopenharmony_ci
23428c2ecf20Sopenharmony_ci		/*
23438c2ecf20Sopenharmony_ci		 * Ok, we have the page, and it's up-to-date, so
23448c2ecf20Sopenharmony_ci		 * now we can copy it to user space...
23458c2ecf20Sopenharmony_ci		 */
23468c2ecf20Sopenharmony_ci
23478c2ecf20Sopenharmony_ci		ret = copy_page_to_iter(page, offset, nr, iter);
23488c2ecf20Sopenharmony_ci		offset += ret;
23498c2ecf20Sopenharmony_ci		index += offset >> PAGE_SHIFT;
23508c2ecf20Sopenharmony_ci		offset &= ~PAGE_MASK;
23518c2ecf20Sopenharmony_ci		prev_offset = offset;
23528c2ecf20Sopenharmony_ci
23538c2ecf20Sopenharmony_ci		put_page(page);
23548c2ecf20Sopenharmony_ci		written += ret;
23558c2ecf20Sopenharmony_ci		if (!iov_iter_count(iter))
23568c2ecf20Sopenharmony_ci			goto out;
23578c2ecf20Sopenharmony_ci		if (ret < nr) {
23588c2ecf20Sopenharmony_ci			error = -EFAULT;
23598c2ecf20Sopenharmony_ci			goto out;
23608c2ecf20Sopenharmony_ci		}
23618c2ecf20Sopenharmony_ci		continue;
23628c2ecf20Sopenharmony_ci
23638c2ecf20Sopenharmony_cipage_not_up_to_date:
23648c2ecf20Sopenharmony_ci		/* Get exclusive access to the page ... */
23658c2ecf20Sopenharmony_ci		if (iocb->ki_flags & IOCB_WAITQ) {
23668c2ecf20Sopenharmony_ci			if (written) {
23678c2ecf20Sopenharmony_ci				put_page(page);
23688c2ecf20Sopenharmony_ci				goto out;
23698c2ecf20Sopenharmony_ci			}
23708c2ecf20Sopenharmony_ci			error = lock_page_async(page, iocb->ki_waitq);
23718c2ecf20Sopenharmony_ci		} else {
23728c2ecf20Sopenharmony_ci			error = lock_page_killable(page);
23738c2ecf20Sopenharmony_ci		}
23748c2ecf20Sopenharmony_ci		if (unlikely(error))
23758c2ecf20Sopenharmony_ci			goto readpage_error;
23768c2ecf20Sopenharmony_ci
23778c2ecf20Sopenharmony_cipage_not_up_to_date_locked:
23788c2ecf20Sopenharmony_ci		/* Did it get truncated before we got the lock? */
23798c2ecf20Sopenharmony_ci		if (!page->mapping) {
23808c2ecf20Sopenharmony_ci			unlock_page(page);
23818c2ecf20Sopenharmony_ci			put_page(page);
23828c2ecf20Sopenharmony_ci			continue;
23838c2ecf20Sopenharmony_ci		}
23848c2ecf20Sopenharmony_ci
23858c2ecf20Sopenharmony_ci		/* Did somebody else fill it already? */
23868c2ecf20Sopenharmony_ci		if (PageUptodate(page)) {
23878c2ecf20Sopenharmony_ci			unlock_page(page);
23888c2ecf20Sopenharmony_ci			goto page_ok;
23898c2ecf20Sopenharmony_ci		}
23908c2ecf20Sopenharmony_ci
23918c2ecf20Sopenharmony_cireadpage:
23928c2ecf20Sopenharmony_ci		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
23938c2ecf20Sopenharmony_ci			unlock_page(page);
23948c2ecf20Sopenharmony_ci			put_page(page);
23958c2ecf20Sopenharmony_ci			goto would_block;
23968c2ecf20Sopenharmony_ci		}
23978c2ecf20Sopenharmony_ci		/*
23988c2ecf20Sopenharmony_ci		 * A previous I/O error may have been due to temporary
23998c2ecf20Sopenharmony_ci		 * failures, eg. multipath errors.
24008c2ecf20Sopenharmony_ci		 * PG_error will be set again if readpage fails.
24018c2ecf20Sopenharmony_ci		 */
24028c2ecf20Sopenharmony_ci		ClearPageError(page);
24038c2ecf20Sopenharmony_ci		/* Start the actual read. The read will unlock the page. */
24048c2ecf20Sopenharmony_ci		error = mapping->a_ops->readpage(filp, page);
24058c2ecf20Sopenharmony_ci
24068c2ecf20Sopenharmony_ci		if (unlikely(error)) {
24078c2ecf20Sopenharmony_ci			if (error == AOP_TRUNCATED_PAGE) {
24088c2ecf20Sopenharmony_ci				put_page(page);
24098c2ecf20Sopenharmony_ci				error = 0;
24108c2ecf20Sopenharmony_ci				goto find_page;
24118c2ecf20Sopenharmony_ci			}
24128c2ecf20Sopenharmony_ci			goto readpage_error;
24138c2ecf20Sopenharmony_ci		}
24148c2ecf20Sopenharmony_ci
24158c2ecf20Sopenharmony_ci		if (!PageUptodate(page)) {
24168c2ecf20Sopenharmony_ci			if (iocb->ki_flags & IOCB_WAITQ) {
24178c2ecf20Sopenharmony_ci				if (written) {
24188c2ecf20Sopenharmony_ci					put_page(page);
24198c2ecf20Sopenharmony_ci					goto out;
24208c2ecf20Sopenharmony_ci				}
24218c2ecf20Sopenharmony_ci				error = lock_page_async(page, iocb->ki_waitq);
24228c2ecf20Sopenharmony_ci			} else {
24238c2ecf20Sopenharmony_ci				error = lock_page_killable(page);
24248c2ecf20Sopenharmony_ci			}
24258c2ecf20Sopenharmony_ci
24268c2ecf20Sopenharmony_ci			if (unlikely(error))
24278c2ecf20Sopenharmony_ci				goto readpage_error;
24288c2ecf20Sopenharmony_ci			if (!PageUptodate(page)) {
24298c2ecf20Sopenharmony_ci				if (page->mapping == NULL) {
24308c2ecf20Sopenharmony_ci					/*
24318c2ecf20Sopenharmony_ci					 * invalidate_mapping_pages got it
24328c2ecf20Sopenharmony_ci					 */
24338c2ecf20Sopenharmony_ci					unlock_page(page);
24348c2ecf20Sopenharmony_ci					put_page(page);
24358c2ecf20Sopenharmony_ci					goto find_page;
24368c2ecf20Sopenharmony_ci				}
24378c2ecf20Sopenharmony_ci				unlock_page(page);
24388c2ecf20Sopenharmony_ci				shrink_readahead_size_eio(ra);
24398c2ecf20Sopenharmony_ci				error = -EIO;
24408c2ecf20Sopenharmony_ci				goto readpage_error;
24418c2ecf20Sopenharmony_ci			}
24428c2ecf20Sopenharmony_ci			unlock_page(page);
24438c2ecf20Sopenharmony_ci		}
24448c2ecf20Sopenharmony_ci
24458c2ecf20Sopenharmony_ci		goto page_ok;
24468c2ecf20Sopenharmony_ci
24478c2ecf20Sopenharmony_cireadpage_error:
24488c2ecf20Sopenharmony_ci		/* UHHUH! A synchronous read error occurred. Report it */
24498c2ecf20Sopenharmony_ci		put_page(page);
24508c2ecf20Sopenharmony_ci		goto out;
24518c2ecf20Sopenharmony_ci
24528c2ecf20Sopenharmony_cino_cached_page:
24538c2ecf20Sopenharmony_ci		/*
24548c2ecf20Sopenharmony_ci		 * Ok, it wasn't cached, so we need to create a new
24558c2ecf20Sopenharmony_ci		 * page..
24568c2ecf20Sopenharmony_ci		 */
24578c2ecf20Sopenharmony_ci		page = page_cache_alloc(mapping);
24588c2ecf20Sopenharmony_ci		if (!page) {
24598c2ecf20Sopenharmony_ci			error = -ENOMEM;
24608c2ecf20Sopenharmony_ci			goto out;
24618c2ecf20Sopenharmony_ci		}
24628c2ecf20Sopenharmony_ci		error = add_to_page_cache_lru(page, mapping, index,
24638c2ecf20Sopenharmony_ci				mapping_gfp_constraint(mapping, GFP_KERNEL));
24648c2ecf20Sopenharmony_ci		if (error) {
24658c2ecf20Sopenharmony_ci			put_page(page);
24668c2ecf20Sopenharmony_ci			if (error == -EEXIST) {
24678c2ecf20Sopenharmony_ci				error = 0;
24688c2ecf20Sopenharmony_ci				goto find_page;
24698c2ecf20Sopenharmony_ci			}
24708c2ecf20Sopenharmony_ci			goto out;
24718c2ecf20Sopenharmony_ci		}
24728c2ecf20Sopenharmony_ci		goto readpage;
24738c2ecf20Sopenharmony_ci	}
24748c2ecf20Sopenharmony_ci
24758c2ecf20Sopenharmony_ciwould_block:
24768c2ecf20Sopenharmony_ci	error = -EAGAIN;
24778c2ecf20Sopenharmony_ciout:
24788c2ecf20Sopenharmony_ci	ra->prev_pos = prev_index;
24798c2ecf20Sopenharmony_ci	ra->prev_pos <<= PAGE_SHIFT;
24808c2ecf20Sopenharmony_ci	ra->prev_pos |= prev_offset;
24818c2ecf20Sopenharmony_ci
24828c2ecf20Sopenharmony_ci	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
24838c2ecf20Sopenharmony_ci	file_accessed(filp);
24848c2ecf20Sopenharmony_ci	return written ? written : error;
24858c2ecf20Sopenharmony_ci}
24868c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(generic_file_buffered_read);
24878c2ecf20Sopenharmony_ci
24888c2ecf20Sopenharmony_ci/**
24898c2ecf20Sopenharmony_ci * generic_file_read_iter - generic filesystem read routine
24908c2ecf20Sopenharmony_ci * @iocb:	kernel I/O control block
24918c2ecf20Sopenharmony_ci * @iter:	destination for the data read
24928c2ecf20Sopenharmony_ci *
24938c2ecf20Sopenharmony_ci * This is the "read_iter()" routine for all filesystems
24948c2ecf20Sopenharmony_ci * that can use the page cache directly.
24958c2ecf20Sopenharmony_ci *
24968c2ecf20Sopenharmony_ci * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
24978c2ecf20Sopenharmony_ci * be returned when no data can be read without waiting for I/O requests
24988c2ecf20Sopenharmony_ci * to complete; it doesn't prevent readahead.
24998c2ecf20Sopenharmony_ci *
25008c2ecf20Sopenharmony_ci * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
25018c2ecf20Sopenharmony_ci * requests shall be made for the read or for readahead.  When no data
25028c2ecf20Sopenharmony_ci * can be read, -EAGAIN shall be returned.  When readahead would be
25038c2ecf20Sopenharmony_ci * triggered, a partial, possibly empty read shall be returned.
25048c2ecf20Sopenharmony_ci *
25058c2ecf20Sopenharmony_ci * Return:
25068c2ecf20Sopenharmony_ci * * number of bytes copied, even for partial reads
25078c2ecf20Sopenharmony_ci * * negative error code (or 0 if IOCB_NOIO) if nothing was read
25088c2ecf20Sopenharmony_ci */
25098c2ecf20Sopenharmony_cissize_t
25108c2ecf20Sopenharmony_cigeneric_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
25118c2ecf20Sopenharmony_ci{
25128c2ecf20Sopenharmony_ci	size_t count = iov_iter_count(iter);
25138c2ecf20Sopenharmony_ci	ssize_t retval = 0;
25148c2ecf20Sopenharmony_ci
25158c2ecf20Sopenharmony_ci	if (!count)
25168c2ecf20Sopenharmony_ci		goto out; /* skip atime */
25178c2ecf20Sopenharmony_ci
25188c2ecf20Sopenharmony_ci	if (iocb->ki_flags & IOCB_DIRECT) {
25198c2ecf20Sopenharmony_ci		struct file *file = iocb->ki_filp;
25208c2ecf20Sopenharmony_ci		struct address_space *mapping = file->f_mapping;
25218c2ecf20Sopenharmony_ci		struct inode *inode = mapping->host;
25228c2ecf20Sopenharmony_ci		loff_t size;
25238c2ecf20Sopenharmony_ci
25248c2ecf20Sopenharmony_ci		size = i_size_read(inode);
25258c2ecf20Sopenharmony_ci		if (iocb->ki_flags & IOCB_NOWAIT) {
25268c2ecf20Sopenharmony_ci			if (filemap_range_has_page(mapping, iocb->ki_pos,
25278c2ecf20Sopenharmony_ci						   iocb->ki_pos + count - 1))
25288c2ecf20Sopenharmony_ci				return -EAGAIN;
25298c2ecf20Sopenharmony_ci		} else {
25308c2ecf20Sopenharmony_ci			retval = filemap_write_and_wait_range(mapping,
25318c2ecf20Sopenharmony_ci						iocb->ki_pos,
25328c2ecf20Sopenharmony_ci					        iocb->ki_pos + count - 1);
25338c2ecf20Sopenharmony_ci			if (retval < 0)
25348c2ecf20Sopenharmony_ci				goto out;
25358c2ecf20Sopenharmony_ci		}
25368c2ecf20Sopenharmony_ci
25378c2ecf20Sopenharmony_ci		file_accessed(file);
25388c2ecf20Sopenharmony_ci
25398c2ecf20Sopenharmony_ci		retval = mapping->a_ops->direct_IO(iocb, iter);
25408c2ecf20Sopenharmony_ci		if (retval >= 0) {
25418c2ecf20Sopenharmony_ci			iocb->ki_pos += retval;
25428c2ecf20Sopenharmony_ci			count -= retval;
25438c2ecf20Sopenharmony_ci		}
25448c2ecf20Sopenharmony_ci		iov_iter_revert(iter, count - iov_iter_count(iter));
25458c2ecf20Sopenharmony_ci
25468c2ecf20Sopenharmony_ci		/*
25478c2ecf20Sopenharmony_ci		 * Btrfs can have a short DIO read if we encounter
25488c2ecf20Sopenharmony_ci		 * compressed extents, so if there was an error, or if
25498c2ecf20Sopenharmony_ci		 * we've already read everything we wanted to, or if
25508c2ecf20Sopenharmony_ci		 * there was a short read because we hit EOF, go ahead
25518c2ecf20Sopenharmony_ci		 * and return.  Otherwise fallthrough to buffered io for
25528c2ecf20Sopenharmony_ci		 * the rest of the read.  Buffered reads will not work for
25538c2ecf20Sopenharmony_ci		 * DAX files, so don't bother trying.
25548c2ecf20Sopenharmony_ci		 */
25558c2ecf20Sopenharmony_ci		if (retval < 0 || !count || iocb->ki_pos >= size ||
25568c2ecf20Sopenharmony_ci		    IS_DAX(inode))
25578c2ecf20Sopenharmony_ci			goto out;
25588c2ecf20Sopenharmony_ci	}
25598c2ecf20Sopenharmony_ci
25608c2ecf20Sopenharmony_ci	retval = generic_file_buffered_read(iocb, iter, retval);
25618c2ecf20Sopenharmony_ciout:
25628c2ecf20Sopenharmony_ci	return retval;
25638c2ecf20Sopenharmony_ci}
25648c2ecf20Sopenharmony_ciEXPORT_SYMBOL(generic_file_read_iter);
25658c2ecf20Sopenharmony_ci
25668c2ecf20Sopenharmony_ci#ifdef CONFIG_MMU
25678c2ecf20Sopenharmony_ci#define MMAP_LOTSAMISS  (100)
25688c2ecf20Sopenharmony_ci/*
25698c2ecf20Sopenharmony_ci * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
25708c2ecf20Sopenharmony_ci * @vmf - the vm_fault for this fault.
25718c2ecf20Sopenharmony_ci * @page - the page to lock.
25728c2ecf20Sopenharmony_ci * @fpin - the pointer to the file we may pin (or is already pinned).
25738c2ecf20Sopenharmony_ci *
25748c2ecf20Sopenharmony_ci * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
25758c2ecf20Sopenharmony_ci * It differs in that it actually returns the page locked if it returns 1 and 0
25768c2ecf20Sopenharmony_ci * if it couldn't lock the page.  If we did have to drop the mmap_lock then fpin
25778c2ecf20Sopenharmony_ci * will point to the pinned file and needs to be fput()'ed at a later point.
25788c2ecf20Sopenharmony_ci */
25798c2ecf20Sopenharmony_cistatic int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
25808c2ecf20Sopenharmony_ci				     struct file **fpin)
25818c2ecf20Sopenharmony_ci{
25828c2ecf20Sopenharmony_ci	if (trylock_page(page))
25838c2ecf20Sopenharmony_ci		return 1;
25848c2ecf20Sopenharmony_ci
25858c2ecf20Sopenharmony_ci	/*
25868c2ecf20Sopenharmony_ci	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
25878c2ecf20Sopenharmony_ci	 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
25888c2ecf20Sopenharmony_ci	 * is supposed to work. We have way too many special cases..
25898c2ecf20Sopenharmony_ci	 */
25908c2ecf20Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
25918c2ecf20Sopenharmony_ci		return 0;
25928c2ecf20Sopenharmony_ci
25938c2ecf20Sopenharmony_ci	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
25948c2ecf20Sopenharmony_ci	if (vmf->flags & FAULT_FLAG_KILLABLE) {
25958c2ecf20Sopenharmony_ci		if (__lock_page_killable(page)) {
25968c2ecf20Sopenharmony_ci			/*
25978c2ecf20Sopenharmony_ci			 * We didn't have the right flags to drop the mmap_lock,
25988c2ecf20Sopenharmony_ci			 * but all fault_handlers only check for fatal signals
25998c2ecf20Sopenharmony_ci			 * if we return VM_FAULT_RETRY, so we need to drop the
26008c2ecf20Sopenharmony_ci			 * mmap_lock here and return 0 if we don't have a fpin.
26018c2ecf20Sopenharmony_ci			 */
26028c2ecf20Sopenharmony_ci			if (*fpin == NULL)
26038c2ecf20Sopenharmony_ci				mmap_read_unlock(vmf->vma->vm_mm);
26048c2ecf20Sopenharmony_ci			return 0;
26058c2ecf20Sopenharmony_ci		}
26068c2ecf20Sopenharmony_ci	} else
26078c2ecf20Sopenharmony_ci		__lock_page(page);
26088c2ecf20Sopenharmony_ci	return 1;
26098c2ecf20Sopenharmony_ci}
26108c2ecf20Sopenharmony_ci
26118c2ecf20Sopenharmony_ci
26128c2ecf20Sopenharmony_ci/*
26138c2ecf20Sopenharmony_ci * Synchronous readahead happens when we don't even find a page in the page
26148c2ecf20Sopenharmony_ci * cache at all.  We don't want to perform IO under the mmap sem, so if we have
26158c2ecf20Sopenharmony_ci * to drop the mmap sem we return the file that was pinned in order for us to do
26168c2ecf20Sopenharmony_ci * that.  If we didn't pin a file then we return NULL.  The file that is
26178c2ecf20Sopenharmony_ci * returned needs to be fput()'ed when we're done with it.
26188c2ecf20Sopenharmony_ci */
26198c2ecf20Sopenharmony_cistatic struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
26208c2ecf20Sopenharmony_ci{
26218c2ecf20Sopenharmony_ci	struct file *file = vmf->vma->vm_file;
26228c2ecf20Sopenharmony_ci	struct file_ra_state *ra = &file->f_ra;
26238c2ecf20Sopenharmony_ci	struct address_space *mapping = file->f_mapping;
26248c2ecf20Sopenharmony_ci	DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
26258c2ecf20Sopenharmony_ci	struct file *fpin = NULL;
26268c2ecf20Sopenharmony_ci	unsigned int mmap_miss;
26278c2ecf20Sopenharmony_ci
26288c2ecf20Sopenharmony_ci	/* If we don't want any read-ahead, don't bother */
26298c2ecf20Sopenharmony_ci	if (vmf->vma->vm_flags & VM_RAND_READ)
26308c2ecf20Sopenharmony_ci		return fpin;
26318c2ecf20Sopenharmony_ci	if (!ra->ra_pages)
26328c2ecf20Sopenharmony_ci		return fpin;
26338c2ecf20Sopenharmony_ci
26348c2ecf20Sopenharmony_ci	if (vmf->vma->vm_flags & VM_SEQ_READ) {
26358c2ecf20Sopenharmony_ci		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
26368c2ecf20Sopenharmony_ci		page_cache_sync_ra(&ractl, ra, ra->ra_pages);
26378c2ecf20Sopenharmony_ci		return fpin;
26388c2ecf20Sopenharmony_ci	}
26398c2ecf20Sopenharmony_ci
26408c2ecf20Sopenharmony_ci	/* Avoid banging the cache line if not needed */
26418c2ecf20Sopenharmony_ci	mmap_miss = READ_ONCE(ra->mmap_miss);
26428c2ecf20Sopenharmony_ci	if (mmap_miss < MMAP_LOTSAMISS * 10)
26438c2ecf20Sopenharmony_ci		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
26448c2ecf20Sopenharmony_ci
26458c2ecf20Sopenharmony_ci	/*
26468c2ecf20Sopenharmony_ci	 * Do we miss much more than hit in this file? If so,
26478c2ecf20Sopenharmony_ci	 * stop bothering with read-ahead. It will only hurt.
26488c2ecf20Sopenharmony_ci	 */
26498c2ecf20Sopenharmony_ci	if (mmap_miss > MMAP_LOTSAMISS)
26508c2ecf20Sopenharmony_ci		return fpin;
26518c2ecf20Sopenharmony_ci
26528c2ecf20Sopenharmony_ci	/*
26538c2ecf20Sopenharmony_ci	 * mmap read-around
26548c2ecf20Sopenharmony_ci	 */
26558c2ecf20Sopenharmony_ci	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
26568c2ecf20Sopenharmony_ci	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
26578c2ecf20Sopenharmony_ci	ra->size = ra->ra_pages;
26588c2ecf20Sopenharmony_ci	ra->async_size = ra->ra_pages / 4;
26598c2ecf20Sopenharmony_ci	ractl._index = ra->start;
26608c2ecf20Sopenharmony_ci	do_page_cache_ra(&ractl, ra->size, ra->async_size);
26618c2ecf20Sopenharmony_ci	return fpin;
26628c2ecf20Sopenharmony_ci}
26638c2ecf20Sopenharmony_ci
26648c2ecf20Sopenharmony_ci/*
26658c2ecf20Sopenharmony_ci * Asynchronous readahead happens when we find the page and PG_readahead,
26668c2ecf20Sopenharmony_ci * so we want to possibly extend the readahead further.  We return the file that
26678c2ecf20Sopenharmony_ci * was pinned if we have to drop the mmap_lock in order to do IO.
26688c2ecf20Sopenharmony_ci */
26698c2ecf20Sopenharmony_cistatic struct file *do_async_mmap_readahead(struct vm_fault *vmf,
26708c2ecf20Sopenharmony_ci					    struct page *page)
26718c2ecf20Sopenharmony_ci{
26728c2ecf20Sopenharmony_ci	struct file *file = vmf->vma->vm_file;
26738c2ecf20Sopenharmony_ci	struct file_ra_state *ra = &file->f_ra;
26748c2ecf20Sopenharmony_ci	struct address_space *mapping = file->f_mapping;
26758c2ecf20Sopenharmony_ci	struct file *fpin = NULL;
26768c2ecf20Sopenharmony_ci	unsigned int mmap_miss;
26778c2ecf20Sopenharmony_ci	pgoff_t offset = vmf->pgoff;
26788c2ecf20Sopenharmony_ci
26798c2ecf20Sopenharmony_ci	/* If we don't want any read-ahead, don't bother */
26808c2ecf20Sopenharmony_ci	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
26818c2ecf20Sopenharmony_ci		return fpin;
26828c2ecf20Sopenharmony_ci	mmap_miss = READ_ONCE(ra->mmap_miss);
26838c2ecf20Sopenharmony_ci	if (mmap_miss)
26848c2ecf20Sopenharmony_ci		WRITE_ONCE(ra->mmap_miss, --mmap_miss);
26858c2ecf20Sopenharmony_ci	if (PageReadahead(page)) {
26868c2ecf20Sopenharmony_ci		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
26878c2ecf20Sopenharmony_ci		page_cache_async_readahead(mapping, ra, file,
26888c2ecf20Sopenharmony_ci					   page, offset, ra->ra_pages);
26898c2ecf20Sopenharmony_ci	}
26908c2ecf20Sopenharmony_ci	return fpin;
26918c2ecf20Sopenharmony_ci}
26928c2ecf20Sopenharmony_ci
26938c2ecf20Sopenharmony_ci/**
26948c2ecf20Sopenharmony_ci * filemap_fault - read in file data for page fault handling
26958c2ecf20Sopenharmony_ci * @vmf:	struct vm_fault containing details of the fault
26968c2ecf20Sopenharmony_ci *
26978c2ecf20Sopenharmony_ci * filemap_fault() is invoked via the vma operations vector for a
26988c2ecf20Sopenharmony_ci * mapped memory region to read in file data during a page fault.
26998c2ecf20Sopenharmony_ci *
27008c2ecf20Sopenharmony_ci * The goto's are kind of ugly, but this streamlines the normal case of having
27018c2ecf20Sopenharmony_ci * it in the page cache, and handles the special cases reasonably without
27028c2ecf20Sopenharmony_ci * having a lot of duplicated code.
27038c2ecf20Sopenharmony_ci *
27048c2ecf20Sopenharmony_ci * vma->vm_mm->mmap_lock must be held on entry.
27058c2ecf20Sopenharmony_ci *
27068c2ecf20Sopenharmony_ci * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
27078c2ecf20Sopenharmony_ci * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
27088c2ecf20Sopenharmony_ci *
27098c2ecf20Sopenharmony_ci * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
27108c2ecf20Sopenharmony_ci * has not been released.
27118c2ecf20Sopenharmony_ci *
27128c2ecf20Sopenharmony_ci * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
27138c2ecf20Sopenharmony_ci *
27148c2ecf20Sopenharmony_ci * Return: bitwise-OR of %VM_FAULT_ codes.
27158c2ecf20Sopenharmony_ci */
27168c2ecf20Sopenharmony_civm_fault_t filemap_fault(struct vm_fault *vmf)
27178c2ecf20Sopenharmony_ci{
27188c2ecf20Sopenharmony_ci	int error;
27198c2ecf20Sopenharmony_ci	struct file *file = vmf->vma->vm_file;
27208c2ecf20Sopenharmony_ci	struct file *fpin = NULL;
27218c2ecf20Sopenharmony_ci	struct address_space *mapping = file->f_mapping;
27228c2ecf20Sopenharmony_ci	struct file_ra_state *ra = &file->f_ra;
27238c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
27248c2ecf20Sopenharmony_ci	pgoff_t offset = vmf->pgoff;
27258c2ecf20Sopenharmony_ci	pgoff_t max_off;
27268c2ecf20Sopenharmony_ci	struct page *page;
27278c2ecf20Sopenharmony_ci	vm_fault_t ret = 0;
27288c2ecf20Sopenharmony_ci
27298c2ecf20Sopenharmony_ci	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
27308c2ecf20Sopenharmony_ci	if (unlikely(offset >= max_off))
27318c2ecf20Sopenharmony_ci		return VM_FAULT_SIGBUS;
27328c2ecf20Sopenharmony_ci
27338c2ecf20Sopenharmony_ci	/*
27348c2ecf20Sopenharmony_ci	 * Do we have something in the page cache already?
27358c2ecf20Sopenharmony_ci	 */
27368c2ecf20Sopenharmony_ci	page = find_get_page(mapping, offset);
27378c2ecf20Sopenharmony_ci	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
27388c2ecf20Sopenharmony_ci		/*
27398c2ecf20Sopenharmony_ci		 * We found the page, so try async readahead before
27408c2ecf20Sopenharmony_ci		 * waiting for the lock.
27418c2ecf20Sopenharmony_ci		 */
27428c2ecf20Sopenharmony_ci		fpin = do_async_mmap_readahead(vmf, page);
27438c2ecf20Sopenharmony_ci	} else if (!page) {
27448c2ecf20Sopenharmony_ci		/* No page in the page cache at all */
27458c2ecf20Sopenharmony_ci		count_vm_event(PGMAJFAULT);
27468c2ecf20Sopenharmony_ci		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
27478c2ecf20Sopenharmony_ci		ret = VM_FAULT_MAJOR;
27488c2ecf20Sopenharmony_ci		fpin = do_sync_mmap_readahead(vmf);
27498c2ecf20Sopenharmony_ciretry_find:
27508c2ecf20Sopenharmony_ci		page = pagecache_get_page(mapping, offset,
27518c2ecf20Sopenharmony_ci					  FGP_CREAT|FGP_FOR_MMAP,
27528c2ecf20Sopenharmony_ci					  vmf->gfp_mask);
27538c2ecf20Sopenharmony_ci		if (!page) {
27548c2ecf20Sopenharmony_ci			if (fpin)
27558c2ecf20Sopenharmony_ci				goto out_retry;
27568c2ecf20Sopenharmony_ci			return VM_FAULT_OOM;
27578c2ecf20Sopenharmony_ci		}
27588c2ecf20Sopenharmony_ci	}
27598c2ecf20Sopenharmony_ci
27608c2ecf20Sopenharmony_ci	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
27618c2ecf20Sopenharmony_ci		goto out_retry;
27628c2ecf20Sopenharmony_ci
27638c2ecf20Sopenharmony_ci	/* Did it get truncated? */
27648c2ecf20Sopenharmony_ci	if (unlikely(compound_head(page)->mapping != mapping)) {
27658c2ecf20Sopenharmony_ci		unlock_page(page);
27668c2ecf20Sopenharmony_ci		put_page(page);
27678c2ecf20Sopenharmony_ci		goto retry_find;
27688c2ecf20Sopenharmony_ci	}
27698c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
27708c2ecf20Sopenharmony_ci
27718c2ecf20Sopenharmony_ci	/*
27728c2ecf20Sopenharmony_ci	 * We have a locked page in the page cache, now we need to check
27738c2ecf20Sopenharmony_ci	 * that it's up-to-date. If not, it is going to be due to an error.
27748c2ecf20Sopenharmony_ci	 */
27758c2ecf20Sopenharmony_ci	if (unlikely(!PageUptodate(page)))
27768c2ecf20Sopenharmony_ci		goto page_not_uptodate;
27778c2ecf20Sopenharmony_ci
27788c2ecf20Sopenharmony_ci	/*
27798c2ecf20Sopenharmony_ci	 * We've made it this far and we had to drop our mmap_lock, now is the
27808c2ecf20Sopenharmony_ci	 * time to return to the upper layer and have it re-find the vma and
27818c2ecf20Sopenharmony_ci	 * redo the fault.
27828c2ecf20Sopenharmony_ci	 */
27838c2ecf20Sopenharmony_ci	if (fpin) {
27848c2ecf20Sopenharmony_ci		unlock_page(page);
27858c2ecf20Sopenharmony_ci		goto out_retry;
27868c2ecf20Sopenharmony_ci	}
27878c2ecf20Sopenharmony_ci
27888c2ecf20Sopenharmony_ci	/*
27898c2ecf20Sopenharmony_ci	 * Found the page and have a reference on it.
27908c2ecf20Sopenharmony_ci	 * We must recheck i_size under page lock.
27918c2ecf20Sopenharmony_ci	 */
27928c2ecf20Sopenharmony_ci	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
27938c2ecf20Sopenharmony_ci	if (unlikely(offset >= max_off)) {
27948c2ecf20Sopenharmony_ci		unlock_page(page);
27958c2ecf20Sopenharmony_ci		put_page(page);
27968c2ecf20Sopenharmony_ci		return VM_FAULT_SIGBUS;
27978c2ecf20Sopenharmony_ci	}
27988c2ecf20Sopenharmony_ci
27998c2ecf20Sopenharmony_ci	vmf->page = page;
28008c2ecf20Sopenharmony_ci	return ret | VM_FAULT_LOCKED;
28018c2ecf20Sopenharmony_ci
28028c2ecf20Sopenharmony_cipage_not_uptodate:
28038c2ecf20Sopenharmony_ci	/*
28048c2ecf20Sopenharmony_ci	 * Umm, take care of errors if the page isn't up-to-date.
28058c2ecf20Sopenharmony_ci	 * Try to re-read it _once_. We do this synchronously,
28068c2ecf20Sopenharmony_ci	 * because there really aren't any performance issues here
28078c2ecf20Sopenharmony_ci	 * and we need to check for errors.
28088c2ecf20Sopenharmony_ci	 */
28098c2ecf20Sopenharmony_ci	ClearPageError(page);
28108c2ecf20Sopenharmony_ci	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
28118c2ecf20Sopenharmony_ci	error = mapping->a_ops->readpage(file, page);
28128c2ecf20Sopenharmony_ci	if (!error) {
28138c2ecf20Sopenharmony_ci		wait_on_page_locked(page);
28148c2ecf20Sopenharmony_ci		if (!PageUptodate(page))
28158c2ecf20Sopenharmony_ci			error = -EIO;
28168c2ecf20Sopenharmony_ci	}
28178c2ecf20Sopenharmony_ci	if (fpin)
28188c2ecf20Sopenharmony_ci		goto out_retry;
28198c2ecf20Sopenharmony_ci	put_page(page);
28208c2ecf20Sopenharmony_ci
28218c2ecf20Sopenharmony_ci	if (!error || error == AOP_TRUNCATED_PAGE)
28228c2ecf20Sopenharmony_ci		goto retry_find;
28238c2ecf20Sopenharmony_ci
28248c2ecf20Sopenharmony_ci	shrink_readahead_size_eio(ra);
28258c2ecf20Sopenharmony_ci	return VM_FAULT_SIGBUS;
28268c2ecf20Sopenharmony_ci
28278c2ecf20Sopenharmony_ciout_retry:
28288c2ecf20Sopenharmony_ci	/*
28298c2ecf20Sopenharmony_ci	 * We dropped the mmap_lock, we need to return to the fault handler to
28308c2ecf20Sopenharmony_ci	 * re-find the vma and come back and find our hopefully still populated
28318c2ecf20Sopenharmony_ci	 * page.
28328c2ecf20Sopenharmony_ci	 */
28338c2ecf20Sopenharmony_ci	if (page)
28348c2ecf20Sopenharmony_ci		put_page(page);
28358c2ecf20Sopenharmony_ci	if (fpin)
28368c2ecf20Sopenharmony_ci		fput(fpin);
28378c2ecf20Sopenharmony_ci	return ret | VM_FAULT_RETRY;
28388c2ecf20Sopenharmony_ci}
28398c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_fault);
28408c2ecf20Sopenharmony_ci
28418c2ecf20Sopenharmony_civoid filemap_map_pages(struct vm_fault *vmf,
28428c2ecf20Sopenharmony_ci		pgoff_t start_pgoff, pgoff_t end_pgoff)
28438c2ecf20Sopenharmony_ci{
28448c2ecf20Sopenharmony_ci	struct file *file = vmf->vma->vm_file;
28458c2ecf20Sopenharmony_ci	struct address_space *mapping = file->f_mapping;
28468c2ecf20Sopenharmony_ci	pgoff_t last_pgoff = start_pgoff;
28478c2ecf20Sopenharmony_ci	unsigned long max_idx;
28488c2ecf20Sopenharmony_ci	XA_STATE(xas, &mapping->i_pages, start_pgoff);
28498c2ecf20Sopenharmony_ci	struct page *head, *page;
28508c2ecf20Sopenharmony_ci	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
28518c2ecf20Sopenharmony_ci
28528c2ecf20Sopenharmony_ci	rcu_read_lock();
28538c2ecf20Sopenharmony_ci	xas_for_each(&xas, head, end_pgoff) {
28548c2ecf20Sopenharmony_ci		if (xas_retry(&xas, head))
28558c2ecf20Sopenharmony_ci			continue;
28568c2ecf20Sopenharmony_ci		if (xa_is_value(head))
28578c2ecf20Sopenharmony_ci			goto next;
28588c2ecf20Sopenharmony_ci
28598c2ecf20Sopenharmony_ci		/*
28608c2ecf20Sopenharmony_ci		 * Check for a locked page first, as a speculative
28618c2ecf20Sopenharmony_ci		 * reference may adversely influence page migration.
28628c2ecf20Sopenharmony_ci		 */
28638c2ecf20Sopenharmony_ci		if (PageLocked(head))
28648c2ecf20Sopenharmony_ci			goto next;
28658c2ecf20Sopenharmony_ci		if (!page_cache_get_speculative(head))
28668c2ecf20Sopenharmony_ci			goto next;
28678c2ecf20Sopenharmony_ci
28688c2ecf20Sopenharmony_ci		/* Has the page moved or been split? */
28698c2ecf20Sopenharmony_ci		if (unlikely(head != xas_reload(&xas)))
28708c2ecf20Sopenharmony_ci			goto skip;
28718c2ecf20Sopenharmony_ci		page = find_subpage(head, xas.xa_index);
28728c2ecf20Sopenharmony_ci
28738c2ecf20Sopenharmony_ci		if (!PageUptodate(head) ||
28748c2ecf20Sopenharmony_ci				PageReadahead(page) ||
28758c2ecf20Sopenharmony_ci				PageHWPoison(page))
28768c2ecf20Sopenharmony_ci			goto skip;
28778c2ecf20Sopenharmony_ci		if (!trylock_page(head))
28788c2ecf20Sopenharmony_ci			goto skip;
28798c2ecf20Sopenharmony_ci
28808c2ecf20Sopenharmony_ci		if (head->mapping != mapping || !PageUptodate(head))
28818c2ecf20Sopenharmony_ci			goto unlock;
28828c2ecf20Sopenharmony_ci
28838c2ecf20Sopenharmony_ci		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
28848c2ecf20Sopenharmony_ci		if (xas.xa_index >= max_idx)
28858c2ecf20Sopenharmony_ci			goto unlock;
28868c2ecf20Sopenharmony_ci
28878c2ecf20Sopenharmony_ci		if (mmap_miss > 0)
28888c2ecf20Sopenharmony_ci			mmap_miss--;
28898c2ecf20Sopenharmony_ci
28908c2ecf20Sopenharmony_ci		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
28918c2ecf20Sopenharmony_ci		if (vmf->pte)
28928c2ecf20Sopenharmony_ci			vmf->pte += xas.xa_index - last_pgoff;
28938c2ecf20Sopenharmony_ci		last_pgoff = xas.xa_index;
28948c2ecf20Sopenharmony_ci		if (alloc_set_pte(vmf, page))
28958c2ecf20Sopenharmony_ci			goto unlock;
28968c2ecf20Sopenharmony_ci		unlock_page(head);
28978c2ecf20Sopenharmony_ci		goto next;
28988c2ecf20Sopenharmony_ciunlock:
28998c2ecf20Sopenharmony_ci		unlock_page(head);
29008c2ecf20Sopenharmony_ciskip:
29018c2ecf20Sopenharmony_ci		put_page(head);
29028c2ecf20Sopenharmony_cinext:
29038c2ecf20Sopenharmony_ci		/* Huge page is mapped? No need to proceed. */
29048c2ecf20Sopenharmony_ci		if (pmd_trans_huge(*vmf->pmd))
29058c2ecf20Sopenharmony_ci			break;
29068c2ecf20Sopenharmony_ci	}
29078c2ecf20Sopenharmony_ci	rcu_read_unlock();
29088c2ecf20Sopenharmony_ci	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
29098c2ecf20Sopenharmony_ci}
29108c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_map_pages);
29118c2ecf20Sopenharmony_ci
29128c2ecf20Sopenharmony_civm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
29138c2ecf20Sopenharmony_ci{
29148c2ecf20Sopenharmony_ci	struct page *page = vmf->page;
29158c2ecf20Sopenharmony_ci	struct inode *inode = file_inode(vmf->vma->vm_file);
29168c2ecf20Sopenharmony_ci	vm_fault_t ret = VM_FAULT_LOCKED;
29178c2ecf20Sopenharmony_ci
29188c2ecf20Sopenharmony_ci	sb_start_pagefault(inode->i_sb);
29198c2ecf20Sopenharmony_ci	file_update_time(vmf->vma->vm_file);
29208c2ecf20Sopenharmony_ci	lock_page(page);
29218c2ecf20Sopenharmony_ci	if (page->mapping != inode->i_mapping) {
29228c2ecf20Sopenharmony_ci		unlock_page(page);
29238c2ecf20Sopenharmony_ci		ret = VM_FAULT_NOPAGE;
29248c2ecf20Sopenharmony_ci		goto out;
29258c2ecf20Sopenharmony_ci	}
29268c2ecf20Sopenharmony_ci	/*
29278c2ecf20Sopenharmony_ci	 * We mark the page dirty already here so that when freeze is in
29288c2ecf20Sopenharmony_ci	 * progress, we are guaranteed that writeback during freezing will
29298c2ecf20Sopenharmony_ci	 * see the dirty page and writeprotect it again.
29308c2ecf20Sopenharmony_ci	 */
29318c2ecf20Sopenharmony_ci	set_page_dirty(page);
29328c2ecf20Sopenharmony_ci	wait_for_stable_page(page);
29338c2ecf20Sopenharmony_ciout:
29348c2ecf20Sopenharmony_ci	sb_end_pagefault(inode->i_sb);
29358c2ecf20Sopenharmony_ci	return ret;
29368c2ecf20Sopenharmony_ci}
29378c2ecf20Sopenharmony_ci
/* Default vm_operations for mmap()ed pagecache-backed files. */
const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};
29438c2ecf20Sopenharmony_ci
29448c2ecf20Sopenharmony_ci/* This is used for a general mmap of a disk file */
29458c2ecf20Sopenharmony_ci
29468c2ecf20Sopenharmony_ciint generic_file_mmap(struct file * file, struct vm_area_struct * vma)
29478c2ecf20Sopenharmony_ci{
29488c2ecf20Sopenharmony_ci	struct address_space *mapping = file->f_mapping;
29498c2ecf20Sopenharmony_ci
29508c2ecf20Sopenharmony_ci	if (!mapping->a_ops->readpage)
29518c2ecf20Sopenharmony_ci		return -ENOEXEC;
29528c2ecf20Sopenharmony_ci	file_accessed(file);
29538c2ecf20Sopenharmony_ci	vma->vm_ops = &generic_file_vm_ops;
29548c2ecf20Sopenharmony_ci	return 0;
29558c2ecf20Sopenharmony_ci}
29568c2ecf20Sopenharmony_ci
29578c2ecf20Sopenharmony_ci/*
29588c2ecf20Sopenharmony_ci * This is for filesystems which do not implement ->writepage.
29598c2ecf20Sopenharmony_ci */
29608c2ecf20Sopenharmony_ciint generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
29618c2ecf20Sopenharmony_ci{
29628c2ecf20Sopenharmony_ci	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
29638c2ecf20Sopenharmony_ci		return -EINVAL;
29648c2ecf20Sopenharmony_ci	return generic_file_mmap(file, vma);
29658c2ecf20Sopenharmony_ci}
29668c2ecf20Sopenharmony_ci#else
/* !CONFIG_MMU stub: there are no page faults, so mkwrite cannot succeed. */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
/* !CONFIG_MMU stub: generic mmap of a pagecache file is unsupported. */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
/* !CONFIG_MMU stub: read-only mmap is unsupported without an MMU. */
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
29798c2ecf20Sopenharmony_ci#endif /* CONFIG_MMU */
29808c2ecf20Sopenharmony_ci
29818c2ecf20Sopenharmony_ciEXPORT_SYMBOL(filemap_page_mkwrite);
29828c2ecf20Sopenharmony_ciEXPORT_SYMBOL(generic_file_mmap);
29838c2ecf20Sopenharmony_ciEXPORT_SYMBOL(generic_file_readonly_mmap);
29848c2ecf20Sopenharmony_ci
29858c2ecf20Sopenharmony_cistatic struct page *wait_on_page_read(struct page *page)
29868c2ecf20Sopenharmony_ci{
29878c2ecf20Sopenharmony_ci	if (!IS_ERR(page)) {
29888c2ecf20Sopenharmony_ci		wait_on_page_locked(page);
29898c2ecf20Sopenharmony_ci		if (!PageUptodate(page)) {
29908c2ecf20Sopenharmony_ci			put_page(page);
29918c2ecf20Sopenharmony_ci			page = ERR_PTR(-EIO);
29928c2ecf20Sopenharmony_ci		}
29938c2ecf20Sopenharmony_ci	}
29948c2ecf20Sopenharmony_ci	return page;
29958c2ecf20Sopenharmony_ci}
29968c2ecf20Sopenharmony_ci
/*
 * Common implementation for read_cache_page() and read_cache_page_gfp().
 *
 * Look up the page at @index in @mapping and return it with an elevated
 * refcount, reading it in first if it is absent or not uptodate.
 *
 * @mapping: the page's address_space
 * @index:   the page index
 * @filler:  callback used to fill the page, invoked as filler(data, page);
 *           when NULL, mapping->a_ops->readpage(data, page) is used instead
 * @data:    opaque first argument passed through to @filler / ->readpage
 * @gfp:     allocation flags used if a new page has to be created
 *
 * Return: the uptodate page (unlocked) on success, ERR_PTR() on failure.
 */
static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		/* Not cached: allocate a page, insert it, then fill it. */
		page = __page_cache_alloc(gfp);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			put_page(page);
			/* Someone else inserted first: retry the lookup. */
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

filler:
		if (filler)
			err = filler(data, page);
		else
			err = mapping->a_ops->readpage(data, page);

		if (err < 0) {
			put_page(page);
			return ERR_PTR(err);
		}

		page = wait_on_page_read(page);
		if (IS_ERR(page))
			return page;
		goto out;
	}
	if (PageUptodate(page))
		goto out;

	/*
	 * Page is not up to date and may be locked due to one of the following
	 * case a: Page is being filled and the page lock is held
	 * case b: Read/write error clearing the page uptodate status
	 * case c: Truncation in progress (page locked)
	 * case d: Reclaim in progress
	 *
	 * Case a, the page will be up to date when the page is unlocked.
	 *    There is no need to serialise on the page lock here as the page
	 *    is pinned so the lock gives no additional protection. Even if the
	 *    page is truncated, the data is still valid if PageUptodate as
	 *    it's a race vs truncate race.
	 * Case b, the page will not be up to date
	 * Case c, the page may be truncated but in itself, the data may still
	 *    be valid after IO completes as it's a read vs truncate race. The
	 *    operation must restart if the page is not uptodate on unlock but
	 *    otherwise serialising on page lock to stabilise the mapping gives
	 *    no additional guarantees to the caller as the page lock is
	 *    released before return.
	 * Case d, similar to truncation. If reclaim holds the page lock, it
	 *    will be a race with remove_mapping that determines if the mapping
	 *    is valid on unlock but otherwise the data is valid and there is
	 *    no need to serialise with page lock.
	 *
	 * As the page lock gives no additional guarantee, we optimistically
	 * wait on the page to be unlocked and check if it's up to date and
	 * use the page if it is. Otherwise, the page lock is required to
	 * distinguish between the different cases. The motivation is that we
	 * avoid spurious serialisations and wakeups when multiple processes
	 * wait on the same page for IO to complete.
	 */
	wait_on_page_locked(page);
	if (PageUptodate(page))
		goto out;

	/* Distinguish between all the cases under the safety of the lock */
	lock_page(page);

	/* Case c or d, restart the operation */
	if (!page->mapping) {
		unlock_page(page);
		put_page(page);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}

	/*
	 * A previous I/O error may have been due to temporary
	 * failures.
	 * Clear page error before actual read, PG_error will be
	 * set again if read page fails.
	 */
	ClearPageError(page);
	goto filler;

out:
	mark_page_accessed(page);
	return page;
}
31038c2ecf20Sopenharmony_ci
31048c2ecf20Sopenharmony_ci/**
31058c2ecf20Sopenharmony_ci * read_cache_page - read into page cache, fill it if needed
31068c2ecf20Sopenharmony_ci * @mapping:	the page's address_space
31078c2ecf20Sopenharmony_ci * @index:	the page index
31088c2ecf20Sopenharmony_ci * @filler:	function to perform the read
31098c2ecf20Sopenharmony_ci * @data:	first arg to filler(data, page) function, often left as NULL
31108c2ecf20Sopenharmony_ci *
31118c2ecf20Sopenharmony_ci * Read into the page cache. If a page already exists, and PageUptodate() is
31128c2ecf20Sopenharmony_ci * not set, try to fill the page and wait for it to become unlocked.
31138c2ecf20Sopenharmony_ci *
31148c2ecf20Sopenharmony_ci * If the page does not get brought uptodate, return -EIO.
31158c2ecf20Sopenharmony_ci *
31168c2ecf20Sopenharmony_ci * Return: up to date page on success, ERR_PTR() on failure.
31178c2ecf20Sopenharmony_ci */
31188c2ecf20Sopenharmony_cistruct page *read_cache_page(struct address_space *mapping,
31198c2ecf20Sopenharmony_ci				pgoff_t index,
31208c2ecf20Sopenharmony_ci				int (*filler)(void *, struct page *),
31218c2ecf20Sopenharmony_ci				void *data)
31228c2ecf20Sopenharmony_ci{
31238c2ecf20Sopenharmony_ci	return do_read_cache_page(mapping, index, filler, data,
31248c2ecf20Sopenharmony_ci			mapping_gfp_mask(mapping));
31258c2ecf20Sopenharmony_ci}
31268c2ecf20Sopenharmony_ciEXPORT_SYMBOL(read_cache_page);
31278c2ecf20Sopenharmony_ci
31288c2ecf20Sopenharmony_ci/**
31298c2ecf20Sopenharmony_ci * read_cache_page_gfp - read into page cache, using specified page allocation flags.
31308c2ecf20Sopenharmony_ci * @mapping:	the page's address_space
31318c2ecf20Sopenharmony_ci * @index:	the page index
31328c2ecf20Sopenharmony_ci * @gfp:	the page allocator flags to use if allocating
31338c2ecf20Sopenharmony_ci *
31348c2ecf20Sopenharmony_ci * This is the same as "read_mapping_page(mapping, index, NULL)", but with
31358c2ecf20Sopenharmony_ci * any new page allocations done using the specified allocation flags.
31368c2ecf20Sopenharmony_ci *
31378c2ecf20Sopenharmony_ci * If the page does not get brought uptodate, return -EIO.
31388c2ecf20Sopenharmony_ci *
31398c2ecf20Sopenharmony_ci * Return: up to date page on success, ERR_PTR() on failure.
31408c2ecf20Sopenharmony_ci */
31418c2ecf20Sopenharmony_cistruct page *read_cache_page_gfp(struct address_space *mapping,
31428c2ecf20Sopenharmony_ci				pgoff_t index,
31438c2ecf20Sopenharmony_ci				gfp_t gfp)
31448c2ecf20Sopenharmony_ci{
31458c2ecf20Sopenharmony_ci	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
31468c2ecf20Sopenharmony_ci}
31478c2ecf20Sopenharmony_ciEXPORT_SYMBOL(read_cache_page_gfp);
31488c2ecf20Sopenharmony_ci
31498c2ecf20Sopenharmony_ciint pagecache_write_begin(struct file *file, struct address_space *mapping,
31508c2ecf20Sopenharmony_ci				loff_t pos, unsigned len, unsigned flags,
31518c2ecf20Sopenharmony_ci				struct page **pagep, void **fsdata)
31528c2ecf20Sopenharmony_ci{
31538c2ecf20Sopenharmony_ci	const struct address_space_operations *aops = mapping->a_ops;
31548c2ecf20Sopenharmony_ci
31558c2ecf20Sopenharmony_ci	return aops->write_begin(file, mapping, pos, len, flags,
31568c2ecf20Sopenharmony_ci							pagep, fsdata);
31578c2ecf20Sopenharmony_ci}
31588c2ecf20Sopenharmony_ciEXPORT_SYMBOL(pagecache_write_begin);
31598c2ecf20Sopenharmony_ci
31608c2ecf20Sopenharmony_ciint pagecache_write_end(struct file *file, struct address_space *mapping,
31618c2ecf20Sopenharmony_ci				loff_t pos, unsigned len, unsigned copied,
31628c2ecf20Sopenharmony_ci				struct page *page, void *fsdata)
31638c2ecf20Sopenharmony_ci{
31648c2ecf20Sopenharmony_ci	const struct address_space_operations *aops = mapping->a_ops;
31658c2ecf20Sopenharmony_ci
31668c2ecf20Sopenharmony_ci	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
31678c2ecf20Sopenharmony_ci}
31688c2ecf20Sopenharmony_ciEXPORT_SYMBOL(pagecache_write_end);
31698c2ecf20Sopenharmony_ci
31708c2ecf20Sopenharmony_ci/*
31718c2ecf20Sopenharmony_ci * Warn about a page cache invalidation failure during a direct I/O write.
31728c2ecf20Sopenharmony_ci */
31738c2ecf20Sopenharmony_civoid dio_warn_stale_pagecache(struct file *filp)
31748c2ecf20Sopenharmony_ci{
31758c2ecf20Sopenharmony_ci	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
31768c2ecf20Sopenharmony_ci	char pathname[128];
31778c2ecf20Sopenharmony_ci	struct inode *inode = file_inode(filp);
31788c2ecf20Sopenharmony_ci	char *path;
31798c2ecf20Sopenharmony_ci
31808c2ecf20Sopenharmony_ci	errseq_set(&inode->i_mapping->wb_err, -EIO);
31818c2ecf20Sopenharmony_ci	if (__ratelimit(&_rs)) {
31828c2ecf20Sopenharmony_ci		path = file_path(filp, pathname, sizeof(pathname));
31838c2ecf20Sopenharmony_ci		if (IS_ERR(path))
31848c2ecf20Sopenharmony_ci			path = "(unknown)";
31858c2ecf20Sopenharmony_ci		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
31868c2ecf20Sopenharmony_ci		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
31878c2ecf20Sopenharmony_ci			current->comm);
31888c2ecf20Sopenharmony_ci	}
31898c2ecf20Sopenharmony_ci}
31908c2ecf20Sopenharmony_ci
/*
 * generic_file_direct_write - write data to a file, bypassing the page cache
 * @iocb: IO state (file, position, flags)
 * @from: iov_iter with the data to write
 *
 * Flushes/invalidates any cached pages overlapping the write range before
 * and after calling the mapping's ->direct_IO().
 *
 * Return: number of bytes written, 0 to request fallback to buffered IO
 * (when cached pages could not be invalidated up front), or a negative
 * error code if nothing was written.
 */
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file	*file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode	*inode = mapping->host;
	loff_t		pos = iocb->ki_pos;
	ssize_t		written;
	size_t		write_len;
	pgoff_t		end;

	write_len = iov_iter_count(from);
	end = (pos + write_len - 1) >> PAGE_SHIFT;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* If there are pages to writeback, return */
		if (filemap_range_has_page(inode->i_mapping, pos,
					   pos + write_len - 1))
			return -EAGAIN;
	} else {
		/* Push any dirty cached data in the range to disk first. */
		written = filemap_write_and_wait_range(mapping, pos,
							pos + write_len - 1);
		if (written)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached page from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_SHIFT, end);
	/*
	 * If a page can not be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	if (written) {
		if (written == -EBUSY)
			return 0;
		goto out;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * Noticeable example is a blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0 && mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
		dio_warn_stale_pagecache(file);

	if (written > 0) {
		pos += written;
		write_len -= written;
		/* Extend i_size for regular files that grew past EOF. */
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	/* Rewind the iterator over any bytes ->direct_IO() did not consume. */
	iov_iter_revert(from, write_len - iov_iter_count(from));
out:
	return written;
}
32718c2ecf20Sopenharmony_ciEXPORT_SYMBOL(generic_file_direct_write);
32728c2ecf20Sopenharmony_ci
32738c2ecf20Sopenharmony_ci/*
32748c2ecf20Sopenharmony_ci * Find or create a page at the given pagecache position. Return the locked
32758c2ecf20Sopenharmony_ci * page. This function is specifically for buffered writes.
32768c2ecf20Sopenharmony_ci */
32778c2ecf20Sopenharmony_cistruct page *grab_cache_page_write_begin(struct address_space *mapping,
32788c2ecf20Sopenharmony_ci					pgoff_t index, unsigned flags)
32798c2ecf20Sopenharmony_ci{
32808c2ecf20Sopenharmony_ci	struct page *page;
32818c2ecf20Sopenharmony_ci	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
32828c2ecf20Sopenharmony_ci
32838c2ecf20Sopenharmony_ci	if (flags & AOP_FLAG_NOFS)
32848c2ecf20Sopenharmony_ci		fgp_flags |= FGP_NOFS;
32858c2ecf20Sopenharmony_ci
32868c2ecf20Sopenharmony_ci	page = pagecache_get_page(mapping, index, fgp_flags,
32878c2ecf20Sopenharmony_ci			mapping_gfp_mask(mapping));
32888c2ecf20Sopenharmony_ci	if (page)
32898c2ecf20Sopenharmony_ci		wait_for_stable_page(page);
32908c2ecf20Sopenharmony_ci
32918c2ecf20Sopenharmony_ci	return page;
32928c2ecf20Sopenharmony_ci}
32938c2ecf20Sopenharmony_ciEXPORT_SYMBOL(grab_cache_page_write_begin);
32948c2ecf20Sopenharmony_ci
/*
 * generic_perform_write - buffered write loop through the page cache
 * @file: file to write to
 * @i:    iov_iter supplying the source data
 * @pos:  file offset to start writing at
 *
 * Copies the data one pagecache page at a time via the mapping's
 * ->write_begin()/->write_end(), balancing dirty pages as it goes.
 *
 * Return: number of bytes written, or a negative error code if nothing
 * was written.
 */
ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		/* ->write_end() may adjust the number of bytes accepted. */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}
33778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(generic_perform_write);
33788c2ecf20Sopenharmony_ci
33798c2ecf20Sopenharmony_ci/**
33808c2ecf20Sopenharmony_ci * __generic_file_write_iter - write data to a file
33818c2ecf20Sopenharmony_ci * @iocb:	IO state structure (file, offset, etc.)
33828c2ecf20Sopenharmony_ci * @from:	iov_iter with data to write
33838c2ecf20Sopenharmony_ci *
33848c2ecf20Sopenharmony_ci * This function does all the work needed for actually writing data to a
33858c2ecf20Sopenharmony_ci * file. It does all basic checks, removes SUID from the file, updates
33868c2ecf20Sopenharmony_ci * modification times and calls proper subroutines depending on whether we
33878c2ecf20Sopenharmony_ci * do direct IO or a standard buffered write.
33888c2ecf20Sopenharmony_ci *
33898c2ecf20Sopenharmony_ci * It expects i_mutex to be grabbed unless we work on a block device or similar
33908c2ecf20Sopenharmony_ci * object which does not need locking at all.
33918c2ecf20Sopenharmony_ci *
33928c2ecf20Sopenharmony_ci * This function does *not* take care of syncing data in case of O_SYNC write.
33938c2ecf20Sopenharmony_ci * A caller has to handle it. This is mainly due to the fact that we want to
33948c2ecf20Sopenharmony_ci * avoid syncing under i_mutex.
33958c2ecf20Sopenharmony_ci *
33968c2ecf20Sopenharmony_ci * Return:
33978c2ecf20Sopenharmony_ci * * number of bytes written, even for truncated writes
33988c2ecf20Sopenharmony_ci * * negative error code if no data has been written at all
33998c2ecf20Sopenharmony_ci */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space * mapping = file->f_mapping;
	struct inode 	*inode = mapping->host;
	ssize_t		written = 0;
	ssize_t		err;
	ssize_t		status;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);
	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos, endbyte;

		written = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
			goto out;

		/* 'pos' records where the buffered fallback resumes. */
		status = generic_perform_write(file, from, pos = iocb->ki_pos);
		/*
		 * If generic_perform_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (unlikely(status < 0)) {
			err = status;
			goto out;
		}
		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		endbyte = pos + status - 1;
		err = filemap_write_and_wait_range(mapping, pos, endbyte);
		if (err == 0) {
			iocb->ki_pos = endbyte + 1;
			written += status;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		/* Plain buffered write through the page cache. */
		written = generic_perform_write(file, from, iocb->ki_pos);
		if (likely(written > 0))
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}
34738c2ecf20Sopenharmony_ciEXPORT_SYMBOL(__generic_file_write_iter);
34748c2ecf20Sopenharmony_ci
34758c2ecf20Sopenharmony_ci/**
34768c2ecf20Sopenharmony_ci * generic_file_write_iter - write data to a file
34778c2ecf20Sopenharmony_ci * @iocb:	IO state structure
34788c2ecf20Sopenharmony_ci * @from:	iov_iter with data to write
34798c2ecf20Sopenharmony_ci *
34808c2ecf20Sopenharmony_ci * This is a wrapper around __generic_file_write_iter() to be used by most
34818c2ecf20Sopenharmony_ci * filesystems. It takes care of syncing the file in case of O_SYNC file
34828c2ecf20Sopenharmony_ci * and acquires i_mutex as needed.
34838c2ecf20Sopenharmony_ci * Return:
34848c2ecf20Sopenharmony_ci * * negative error code if no data has been written at all of
34858c2ecf20Sopenharmony_ci *   vfs_fsync_range() failed for a synchronous write
34868c2ecf20Sopenharmony_ci * * number of bytes written, even for truncated writes
34878c2ecf20Sopenharmony_ci */
34888c2ecf20Sopenharmony_cissize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
34898c2ecf20Sopenharmony_ci{
34908c2ecf20Sopenharmony_ci	struct file *file = iocb->ki_filp;
34918c2ecf20Sopenharmony_ci	struct inode *inode = file->f_mapping->host;
34928c2ecf20Sopenharmony_ci	ssize_t ret;
34938c2ecf20Sopenharmony_ci
34948c2ecf20Sopenharmony_ci	inode_lock(inode);
34958c2ecf20Sopenharmony_ci	ret = generic_write_checks(iocb, from);
34968c2ecf20Sopenharmony_ci	if (ret > 0)
34978c2ecf20Sopenharmony_ci		ret = __generic_file_write_iter(iocb, from);
34988c2ecf20Sopenharmony_ci	inode_unlock(inode);
34998c2ecf20Sopenharmony_ci
35008c2ecf20Sopenharmony_ci	if (ret > 0)
35018c2ecf20Sopenharmony_ci		ret = generic_write_sync(iocb, ret);
35028c2ecf20Sopenharmony_ci	return ret;
35038c2ecf20Sopenharmony_ci}
35048c2ecf20Sopenharmony_ciEXPORT_SYMBOL(generic_file_write_iter);
35058c2ecf20Sopenharmony_ci
35068c2ecf20Sopenharmony_ci/**
35078c2ecf20Sopenharmony_ci * try_to_release_page() - release old fs-specific metadata on a page
35088c2ecf20Sopenharmony_ci *
35098c2ecf20Sopenharmony_ci * @page: the page which the kernel is trying to free
35108c2ecf20Sopenharmony_ci * @gfp_mask: memory allocation flags (and I/O mode)
35118c2ecf20Sopenharmony_ci *
35128c2ecf20Sopenharmony_ci * The address_space is to try to release any data against the page
35138c2ecf20Sopenharmony_ci * (presumably at page->private).
35148c2ecf20Sopenharmony_ci *
35158c2ecf20Sopenharmony_ci * This may also be called if PG_fscache is set on a page, indicating that the
35168c2ecf20Sopenharmony_ci * page is known to the local caching routines.
35178c2ecf20Sopenharmony_ci *
35188c2ecf20Sopenharmony_ci * The @gfp_mask argument specifies whether I/O may be performed to release
35198c2ecf20Sopenharmony_ci * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
35208c2ecf20Sopenharmony_ci *
35218c2ecf20Sopenharmony_ci * Return: %1 if the release was successful, otherwise return zero.
35228c2ecf20Sopenharmony_ci */
35238c2ecf20Sopenharmony_ciint try_to_release_page(struct page *page, gfp_t gfp_mask)
35248c2ecf20Sopenharmony_ci{
35258c2ecf20Sopenharmony_ci	struct address_space * const mapping = page->mapping;
35268c2ecf20Sopenharmony_ci
35278c2ecf20Sopenharmony_ci	BUG_ON(!PageLocked(page));
35288c2ecf20Sopenharmony_ci	if (PageWriteback(page))
35298c2ecf20Sopenharmony_ci		return 0;
35308c2ecf20Sopenharmony_ci
35318c2ecf20Sopenharmony_ci	if (mapping && mapping->a_ops->releasepage)
35328c2ecf20Sopenharmony_ci		return mapping->a_ops->releasepage(page, gfp_mask);
35338c2ecf20Sopenharmony_ci	return try_to_free_buffers(page);
35348c2ecf20Sopenharmony_ci}
35358c2ecf20Sopenharmony_ci
35368c2ecf20Sopenharmony_ciEXPORT_SYMBOL(try_to_release_page);
3537