// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset.  */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

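/*
 * A DAX entry packs the pfn into the bits of the XArray value above the
 * four flag bits reserved by DAX_SHIFT; the helpers below convert between
 * the two representations.
 */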
static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;

	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
	void *entry;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
	}
}

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies. Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (entry && !dax_is_conflict(entry))
		dax_wake_entry(xas, entry, mode);
}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	BUG_ON(dax_is_locked(entry));
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	BUG_ON(!dax_is_locked(old));
	dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

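/*
 * Number of bytes of storage backing an entry: zero for zero-page and
 * empty entries, which carry no filesystem block allocation.
 */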
static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

static unsigned long dax_end_pfn(void *entry)
{
	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_to_pfn(entry); \
			pfn < dax_end_pfn(entry); pfn++)

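/*
 * page->mapping is overloaded with the PAGE_MAPPING_DAX_SHARED marker
 * while the page backs blocks shared by more than one file.
 */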
static inline bool dax_page_is_shared(struct page *page)
{
	return page->mapping == PAGE_MAPPING_DAX_SHARED;
}

/*
 * Set page->mapping to PAGE_MAPPING_DAX_SHARED and increase the refcount.
 */
static inline void dax_page_share_get(struct page *page)
{
	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
		/*
		 * Reset the index if the page was already mapped
		 * regularly before.
		 */
		if (page->mapping)
			page->share = 1;
		page->mapping = PAGE_MAPPING_DAX_SHARED;
	}
	page->share++;
}

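/* Drop one shared reference; returns the number of references remaining. */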
static inline unsigned long dax_page_share_put(struct page *page)
{
	return --page->share;
}

/*
 * When called from dax_insert_entry(), the shared flag indicates whether
 * this entry is shared by multiple files.  If so, set page->mapping to
 * PAGE_MAPPING_DAX_SHARED and use page->share as the refcount.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
		struct vm_area_struct *vma, unsigned long address, bool shared)
{
	unsigned long size = dax_entry_size(entry), pfn, index;
	int i = 0;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	index = linear_page_index(vma, address & ~(size - 1));
	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (shared) {
			dax_page_share_get(page);
		} else {
			WARN_ON_ONCE(page->mapping);
			page->mapping = mapping;
			page->index = index + i++;
		}
	}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{
	unsigned long pfn;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
		if (dax_page_is_shared(page)) {
			/* keep the shared flag if this page is still shared */
			if (dax_page_share_put(page) > 0)
				continue;
		} else
			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
		page->mapping = NULL;
		page->index = 0;
	}
}

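/*
 * DAX pages are considered idle at a refcount of one (see the comment on
 * dax_layout_busy_page_range() below).  Return the first page in this
 * entry pinned beyond that, or NULL if all of its pages are idle.
 */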
static struct page *dax_busy_page(void *entry)
{
	unsigned long pfn;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (page_ref_count(page) > 1)
			return page;
	}
	return NULL;
}

/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/* Ensure folio->mapping isn't freed while we look at it */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(folio->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;

		/*
		 * In the device-dax case there's no need to lock, a
		 * struct dev_pagemap pin is sufficient to keep the
		 * inode alive, and we assume we have dev_pagemap pin
		 * otherwise we would not have a valid pfn_to_page()
		 * translation.
		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		if (mapping != folio->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, folio->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
	struct address_space *mapping = folio->mapping;
	XA_STATE(xas, &mapping->i_pages, folio->index);

	if (S_ISCHR(mapping->host->i_mode))
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
		struct page **page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	rcu_read_lock();
	for (;;) {
		entry = NULL;
		if (!dax_mapping(mapping))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		xas_set(&xas, index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		if (!entry ||
		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
			/*
			 * Because we are looking up the entry by the file's
			 * mapping and index, the entry may not have been
			 * inserted yet, or may be a zero/empty entry.  We
			 * don't treat this as an error.  Return a special
			 * value and do not output @page.
			 */
			entry = (void *)~0UL;
		} else {
			*page = pfn_to_page(dax_to_pfn(entry));
			dax_lock_entry(&xas, entry);
		}
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
		dax_entry_t cookie)
{
	XA_STATE(xas, &mapping->i_pages, index);

	if (cookie == ~0UL)
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries.  This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR.  Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
	void *entry;

retry:
	pmd_downgrade = false;
	xas_lock_irq(xas);
	entry = get_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}

		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (dax_is_zero_entry(entry)) {
			xas_unlock_irq(xas);
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
			xas_reset(xas);
			xas_lock_irq(xas);
		}

		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* undo the PMD join */
		dax_wake_entry(xas, entry, WAKE_ALL);
		mapping->nrpages -= PG_PMD_NR;
		entry = NULL;
		xas_set(xas, index);
	}

	if (entry) {
		dax_lock_entry(xas, entry);
	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrpages += 1UL << order;
	}

out_unlock:
	xas_unlock_irq(xas);
	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
		goto retry;
	if (xas->xa_node == XA_ERROR(-ENOMEM))
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	/*
	 * In the 'limited' case get_user_pages() for dax is disabled.
	 */
	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return NULL;

	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
		return NULL;

	/* If end == LLONG_MAX, all pages from start till the end of file */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;
	/*
	 * If we race get_user_pages_fast() here either we'll see the
	 * elevated page count in the iteration and wait, or
	 * get_user_pages_fast() will see that the page it took a reference
	 * against is no longer mapped in the page tables and bail to the
	 * get_user_pages() slow path.  The slow path is protected by
	 * pte_lock() and pmd_lock(). New references are not taken without
	 * holding those locks, and unmap_mapping_pages() will not zero the
	 * pte or pmd without holding the respective lock, so we are
	 * guaranteed to either see new references or prevent new
	 * references from being established.
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		if (unlikely(dax_is_locked(entry)))
			entry = get_unlocked_entry(&xas, 0);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		if (page)
			break;
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

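/*
 * Remove the DAX entry at @index.  Unless @trunc is set, refuse to remove
 * an entry that is still marked dirty or towrite.  Returns 1 if an entry
 * was removed, 0 otherwise.
 */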
static int __dax_invalidate_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrpages -= 1UL << dax_entry_order(entry);
	ret = 1;
out:
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);
	return ret;
}

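/*
 * Clear the dirty and towrite marks on all DAX entries in [@start, @end].
 */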
static int __dax_clear_dirty_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned int scanned = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end) {
		entry = get_unlocked_entry(&xas, 0);
		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);

		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);

	return 0;
}

/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}

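/* Translate a file position into a pgoff within the iomap's dax device. */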
static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
	return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
}

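/*
 * Copy the page at the faulting position from the dax device into the
 * page allocated for a copy-on-write fault.
 */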
static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
	pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
	void *vto, *kaddr;
	long rc;
	int id;

	id = dax_read_lock();
	rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
				&kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(vmf->cow_page);
	copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
		struct vm_area_struct *vma)
{
	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
		(iter->iomap.flags & IOMAP_F_DIRTY);
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, pfn_t pfn,
		unsigned long flags)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	void *new_entry = dax_make_entry(pfn, flags);
	bool write = iter->flags & IOMAP_WRITE;
	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
	bool shared = iter->iomap.flags & IOMAP_F_SHARED;

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

	xas_reset(xas);
	xas_lock_irq(xas);
	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
				shared);
		/*
		 * Only swap our new entry into the page cache if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the cache, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		old = dax_lock_entry(xas, new_entry);
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	if (write && shared)
		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

	xas_unlock_irq(xas);
	return entry;
}

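/*
 * Flush a single dirty DAX entry: write-protect all user mappings of the
 * range, flush CPU caches to the persistence domain, then clear the dirty
 * mark.
 */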
92662306a36Sopenharmony_cistatic int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
92762306a36Sopenharmony_ci		struct address_space *mapping, void *entry)
92862306a36Sopenharmony_ci{
92962306a36Sopenharmony_ci	unsigned long pfn, index, count, end;
93062306a36Sopenharmony_ci	long ret = 0;
93162306a36Sopenharmony_ci	struct vm_area_struct *vma;
93262306a36Sopenharmony_ci
93362306a36Sopenharmony_ci	/*
93462306a36Sopenharmony_ci	 * A page got tagged dirty in DAX mapping? Something is seriously
93562306a36Sopenharmony_ci	 * wrong.
93662306a36Sopenharmony_ci	 */
93762306a36Sopenharmony_ci	if (WARN_ON(!xa_is_value(entry)))
93862306a36Sopenharmony_ci		return -EIO;
93962306a36Sopenharmony_ci
94062306a36Sopenharmony_ci	if (unlikely(dax_is_locked(entry))) {
94162306a36Sopenharmony_ci		void *old_entry = entry;
94262306a36Sopenharmony_ci
94362306a36Sopenharmony_ci		entry = get_unlocked_entry(xas, 0);
94462306a36Sopenharmony_ci
94562306a36Sopenharmony_ci		/* Entry got punched out / reallocated? */
94662306a36Sopenharmony_ci		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
94762306a36Sopenharmony_ci			goto put_unlocked;
94862306a36Sopenharmony_ci		/*
94962306a36Sopenharmony_ci		 * Entry got reallocated elsewhere? No need to writeback.
95062306a36Sopenharmony_ci		 * We have to compare pfns as we must not bail out due to
95162306a36Sopenharmony_ci		 * difference in lockbit or entry type.
95262306a36Sopenharmony_ci		 */
95362306a36Sopenharmony_ci		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
95462306a36Sopenharmony_ci			goto put_unlocked;
95562306a36Sopenharmony_ci		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
95662306a36Sopenharmony_ci					dax_is_zero_entry(entry))) {
95762306a36Sopenharmony_ci			ret = -EIO;
95862306a36Sopenharmony_ci			goto put_unlocked;
95962306a36Sopenharmony_ci		}
96062306a36Sopenharmony_ci
96162306a36Sopenharmony_ci		/* Another fsync thread may have already done this entry */
96262306a36Sopenharmony_ci		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
96362306a36Sopenharmony_ci			goto put_unlocked;
96462306a36Sopenharmony_ci	}
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci	/* Lock the entry to serialize with page faults */
96762306a36Sopenharmony_ci	dax_lock_entry(xas, entry);
96862306a36Sopenharmony_ci
96962306a36Sopenharmony_ci	/*
97062306a36Sopenharmony_ci	 * We can clear the tag now but we have to be careful so that concurrent
97162306a36Sopenharmony_ci	 * dax_writeback_one() calls for the same index cannot finish before we
97262306a36Sopenharmony_ci	 * actually flush the caches. This is achieved as the calls will look
97362306a36Sopenharmony_ci	 * at the entry only under the i_pages lock and once they do that
97462306a36Sopenharmony_ci	 * they will see the entry locked and wait for it to unlock.
97562306a36Sopenharmony_ci	 */
97662306a36Sopenharmony_ci	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
97762306a36Sopenharmony_ci	xas_unlock_irq(xas);
97862306a36Sopenharmony_ci
97962306a36Sopenharmony_ci	/*
98062306a36Sopenharmony_ci	 * If dax_writeback_mapping_range() was given a wbc->range_start
98162306a36Sopenharmony_ci	 * in the middle of a PMD, the 'index' we use needs to be
98262306a36Sopenharmony_ci	 * aligned to the start of the PMD.
98362306a36Sopenharmony_ci	 * This allows us to flush for PMD_SIZE and not have to worry about
98462306a36Sopenharmony_ci	 * partial PMD writebacks.
98562306a36Sopenharmony_ci	 */
98662306a36Sopenharmony_ci	pfn = dax_to_pfn(entry);
98762306a36Sopenharmony_ci	count = 1UL << dax_entry_order(entry);
98862306a36Sopenharmony_ci	index = xas->xa_index & ~(count - 1);
98962306a36Sopenharmony_ci	end = index + count - 1;
99062306a36Sopenharmony_ci
99162306a36Sopenharmony_ci	/* Walk all mappings of a given index of a file and writeprotect them */
99262306a36Sopenharmony_ci	i_mmap_lock_read(mapping);
99362306a36Sopenharmony_ci	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
99462306a36Sopenharmony_ci		pfn_mkclean_range(pfn, count, index, vma);
99562306a36Sopenharmony_ci		cond_resched();
99662306a36Sopenharmony_ci	}
99762306a36Sopenharmony_ci	i_mmap_unlock_read(mapping);
99862306a36Sopenharmony_ci
99962306a36Sopenharmony_ci	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
100062306a36Sopenharmony_ci	/*
100162306a36Sopenharmony_ci	 * After we have flushed the cache, we can clear the dirty tag. There
100262306a36Sopenharmony_ci	 * cannot be new dirty data in the pfn after the flush has completed as
100362306a36Sopenharmony_ci	 * the pfn mappings are writeprotected and fault waits for mapping
100462306a36Sopenharmony_ci	 * entry lock.
100562306a36Sopenharmony_ci	 */
100662306a36Sopenharmony_ci	xas_reset(xas);
100762306a36Sopenharmony_ci	xas_lock_irq(xas);
100862306a36Sopenharmony_ci	xas_store(xas, entry);
100962306a36Sopenharmony_ci	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
101062306a36Sopenharmony_ci	dax_wake_entry(xas, entry, WAKE_NEXT);
101162306a36Sopenharmony_ci
101262306a36Sopenharmony_ci	trace_dax_writeback_one(mapping->host, index, count);
101362306a36Sopenharmony_ci	return ret;
101462306a36Sopenharmony_ci
101562306a36Sopenharmony_ci put_unlocked:
101662306a36Sopenharmony_ci	put_unlocked_entry(xas, entry, WAKE_NEXT);
101762306a36Sopenharmony_ci	return ret;
101862306a36Sopenharmony_ci}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
			break;
		}
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
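
/*
 * Example (editorial sketch, not part of this file): filesystems normally
 * reach dax_writeback_mapping_range() from their ->writepages() method.
 * This is roughly what ext4 does; other filesystems differ only in how
 * they look up the dax_device:
 *
 *	static int ext4_dax_writepages(struct address_space *mapping,
 *				       struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				EXT4_SB(mapping->host->i_sb)->s_daxdev, wbc);
 *	}
 *
 * Note the WB_SYNC_ALL check above: background writeback is a nop for DAX
 * because there are no dirty pages in the page cache, only dirty XArray
 * entries tracking CPU-cache state; only sync(2)/fsync(2) style callers
 * need the cache flush and dirty-tag clearing done here.
 */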

static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, pfn_t *pfnp)
{
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	int id, rc = 0;
	long length;

	id = dax_read_lock();
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   DAX_ACCESS, kaddr, pfnp);
	if (length < 0) {
		rc = length;
		goto out;
	}
	if (!pfnp)
		goto out_check_addr;
	rc = -EINVAL;
	if (PFN_PHYS(length) < size)
		goto out;
	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size) - 1))
		goto out;
	/* For larger pages we need devmap */
	if (length > 1 && !pfn_t_devmap(*pfnp))
		goto out;
	rc = 0;

out_check_addr:
	if (!kaddr)
		goto out;
	if (!*kaddr)
		rc = -EFAULT;
out:
	dax_read_unlock(id);
	return rc;
}

/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:	file offset the write starts at.
 * @length:	size of the write, in bytes.
 * @align_size:	alignment granularity (either PMD_SIZE or PAGE_SIZE).
 * @srcmap:	iomap srcmap to copy the old data from.
 * @daddr:	destination address to copy to.
 *
 * This can be called from two places.  Either during a DAX write fault (page
 * aligned), to copy @length bytes of data to @daddr.  Or, while doing a
 * normal DAX write operation, dax_iomap_iter() might call this to copy the
 * unaligned head or tail of the range; in that case the copy of the aligned
 * middle is taken care of by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE or UNWRITTEN, zero the
 * area instead to make sure no stale data remains.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
		const struct iomap *srcmap, void *daddr)
{
	loff_t head_off = pos & (align_size - 1);
	size_t size = ALIGN(head_off + length, align_size);
	loff_t end = pos + length;
	loff_t pg_end = round_up(end, align_size);
	/* copy_all is usually set in the page fault case */
	bool copy_all = head_off == 0 && end == pg_end;
	/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
			 srcmap->type == IOMAP_UNWRITTEN;
	void *saddr = NULL;
	int ret = 0;

	if (!zero_edge) {
		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
		if (ret)
			return dax_mem2blk_err(ret);
	}

	if (copy_all) {
		if (zero_edge)
			memset(daddr, 0, size);
		else
			ret = copy_mc_to_kernel(daddr, saddr, length);
		goto out;
	}

	/* Copy the head part of the range */
	if (head_off) {
		if (zero_edge)
			memset(daddr, 0, head_off);
		else {
			ret = copy_mc_to_kernel(daddr, saddr, head_off);
			if (ret)
				return -EIO;
		}
	}

	/* Copy the tail part of the range */
	if (end < pg_end) {
		loff_t tail_off = head_off + length;
		loff_t tail_len = pg_end - end;

		if (zero_edge)
			memset(daddr + tail_off, 0, tail_len);
		else {
			ret = copy_mc_to_kernel(daddr + tail_off,
						saddr + tail_off, tail_len);
			if (ret)
				return -EIO;
		}
	}
out:
	if (zero_edge)
		dax_flush(srcmap->dax_dev, daddr, size);
	return ret ? -EIO : 0;
}
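
/*
 * Worked example (editorial note): a write of length = 0x1400 bytes at
 * pos = 0x1200 with align_size = PAGE_SIZE (0x1000) gives:
 *
 *	head_off = 0x1200 & 0xfff                = 0x200
 *	size     = ALIGN(0x200 + 0x1400, 0x1000) = 0x2000  (two pages)
 *	end      = 0x1200 + 0x1400               = 0x2600
 *	pg_end   = round_up(0x2600, 0x1000)      = 0x3000
 *
 * so the head copy (or memset) covers daddr[0x0, 0x200) and the tail copy
 * covers daddr[0x1600, 0x2000), leaving the caller to fill in the
 * 0x1400-byte middle that is being written anyway.
 */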

/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct inode *inode = iter->inode;
	unsigned long vaddr = vmf->address;
	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
	vm_fault_t ret;

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
	trace_dax_load_hole(inode, vmf, ret);
	return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = mapping->host;
	pgtable_t pgtable = NULL;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	pfn_t pfn;

	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

	if (unlikely(!zero_page))
		goto fallback;

	pfn = page_to_pfn_t(zero_page);
	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
				  DAX_PMD | DAX_ZERO_PAGE);

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		mm_inc_nr_ptes(vma->vm_mm);
	}
	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
	return VM_FAULT_NOPAGE;

fallback:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
	return VM_FAULT_FALLBACK;
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

static s64 dax_unshare_iter(struct iomap_iter *iter)
{
	struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	loff_t length = iomap_length(iter);
	int id = 0;
	s64 ret = 0;
	void *daddr = NULL, *saddr = NULL;

	/* don't bother with blocks that are not shared to start with */
	if (!(iomap->flags & IOMAP_F_SHARED))
		return length;

	id = dax_read_lock();
	ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	/* zero the range if srcmap is a HOLE or UNWRITTEN */
	if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
		memset(daddr, 0, length);
		dax_flush(iomap->dax_dev, daddr, length);
		ret = length;
		goto out_unlock;
	}

	ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	if (copy_mc_to_kernel(daddr, saddr, length) == 0)
		ret = length;
	else
		ret = -EIO;

out_unlock:
	dax_read_unlock(id);
	return dax_mem2blk_err(ret);
}

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = dax_unshare_iter(&iter);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
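
/*
 * Example (editorial sketch, not part of this file): a reflink-capable DAX
 * filesystem calls dax_file_unshare() from its FALLOC_FL_UNSHARE_RANGE
 * handling, roughly the way xfs does:
 *
 *	if (IS_DAX(inode))
 *		error = dax_file_unshare(inode, offset, len,
 *				&xfs_dax_write_iomap_ops);
 *
 * The iomap_ops symbol above is the filesystem's own; the only requirement
 * is that ->iomap_begin() reports shared extents with IOMAP_F_SHARED and
 * supplies a srcmap, since dax_unshare_iter() skips everything else.
 */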

static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	unsigned offset = offset_in_page(pos);
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	void *kaddr;
	long ret;

	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
				NULL);
	if (ret < 0)
		return dax_mem2blk_err(ret);

	memset(kaddr + offset, 0, size);
	if (iomap->flags & IOMAP_F_SHARED)
		ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
					    kaddr);
	else
		dax_flush(iomap->dax_dev, kaddr + offset, size);
	return ret;
}

static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	u64 length = iomap_length(iter);
	s64 written = 0;

	/* already zeroed?  we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;

	/*
	 * invalidate the pages whose sharing state is to be changed
	 * because of CoW.
	 */
	if (iomap->flags & IOMAP_F_SHARED)
		invalidate_inode_pages2_range(iter->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (pos + length - 1) >> PAGE_SHIFT);

	do {
		unsigned offset = offset_in_page(pos);
		unsigned size = min_t(u64, PAGE_SIZE - offset, length);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		long rc;
		int id;

		id = dax_read_lock();
		if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
			rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		else
			rc = dax_memzero(iter, pos, size);
		dax_read_unlock(id);

		if (rc < 0)
			return rc;
		pos += size;
		length -= size;
		written += size;
	} while (length > 0);

	if (did_zero)
		*did_zero = true;
	return written;
}

int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_DAX | IOMAP_ZERO,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = dax_zero_iter(&iter, did_zero);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);

int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
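
/*
 * Example (editorial sketch, not part of this file): these two helpers are
 * the DAX counterparts of iomap_zero_range()/iomap_truncate_page(), and
 * callers typically pick between the pairs based on IS_DAX().  Roughly what
 * xfs does when zeroing a sub-range:
 *
 *	if (IS_DAX(inode))
 *		return dax_zero_range(inode, pos, len, did_zero,
 *				      &xfs_dax_write_iomap_ops);
 *	return iomap_zero_range(inode, pos, len, did_zero,
 *				&xfs_buffered_write_iomap_ops);
 */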

static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		struct iov_iter *iter)
{
	const struct iomap *iomap = &iomi->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	struct dax_device *dax_dev = iomap->dax_dev;
	loff_t end = pos + length, done = 0;
	bool write = iov_iter_rw(iter) == WRITE;
	bool cow = write && iomap->flags & IOMAP_F_SHARED;
	ssize_t ret = 0;
	size_t xfer;
	int id;

	if (!write) {
		end = min(end, i_size_read(iomi->inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	/*
	 * In DAX mode, enforce either pure overwrites of written extents, or
	 * writes to unwritten extents as part of a copy-on-write operation.
	 */
	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
			!(iomap->flags & IOMAP_F_SHARED)))
		return -EIO;

	/*
	 * A write can allocate a block for an area which has a hole page
	 * mapped into page tables. We have to tear down these mappings so
	 * that data written by write(2) is visible in mmap.
	 */
	if (iomap->flags & IOMAP_F_NEW || cow) {
		/*
		 * Filesystem allows CoW on non-shared extents. The src extents
		 * may have been mmapped and dirtied before. To be able to
		 * invalidate their dax entries, we need to clear the dirty
		 * mark in advance.
		 */
		if (cow)
			__dax_clear_dirty_range(iomi->inode->i_mapping,
						pos >> PAGE_SHIFT,
						(end - 1) >> PAGE_SHIFT);
		invalidate_inode_pages2_range(iomi->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		ssize_t map_len;
		bool recovery = false;
		void *kaddr;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				DAX_ACCESS, &kaddr, NULL);
		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
			map_len = dax_direct_access(dax_dev, pgoff,
					PHYS_PFN(size), DAX_RECOVERY_WRITE,
					&kaddr, NULL);
			if (map_len > 0)
				recovery = true;
		}
		if (map_len < 0) {
			ret = dax_mem2blk_err(map_len);
			break;
		}

		if (cow) {
			ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
						    srcmap, kaddr);
			if (ret)
				break;
		}

		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (recovery)
			xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
					map_len, iter);
		else if (write)
			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
					map_len, iter);

		pos += xfer;
		length -= xfer;
		done += xfer;

		if (xfer == 0)
			ret = -EFAULT;
		if (xfer < map_len)
			break;
	}
	dax_read_unlock(id);

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct iomap_iter iomi = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DAX,
	};
	loff_t done = 0;
	int ret;

	if (!iomi.len)
		return 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_write(&iomi.inode->i_rwsem);
		iomi.flags |= IOMAP_WRITE;
	} else {
		lockdep_assert_held(&iomi.inode->i_rwsem);
	}

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iomi, ops)) > 0)
		iomi.processed = dax_iomap_iter(&iomi, iter);

	done = iomi.pos - iocb->ki_pos;
	iocb->ki_pos = iomi.pos;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
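
/*
 * Example (editorial sketch, not part of this file): a minimal ->read_iter()
 * built on dax_iomap_rw().  "example_iomap_ops" is a placeholder for the
 * filesystem's own iomap_ops; the shared i_rwsem satisfies the lockdep
 * assertion above (writers take it exclusively instead):
 *
 *	static ssize_t example_dax_read_iter(struct kiocb *iocb,
 *			struct iov_iter *to)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 */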

static vm_fault_t dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	return vmf_error(error);
}

/*
 * When handling a synchronous page fault and the inode needs an fsync, we
 * can insert the PTE/PMD into page tables only after that fsync has
 * happened. Skip insertion for now and return the pfn so that the caller
 * can insert it after the fsync is done.
 */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{
	if (WARN_ON_ONCE(!pfnp))
		return VM_FAULT_SIGBUS;
	*pfnp = pfn;
	return VM_FAULT_NEEDDSYNC;
}

static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
		const struct iomap_iter *iter)
{
	vm_fault_t ret;
	int error = 0;

	switch (iter->iomap.type) {
	case IOMAP_HOLE:
	case IOMAP_UNWRITTEN:
		clear_user_highpage(vmf->cow_page, vmf->address);
		break;
	case IOMAP_MAPPED:
		error = copy_cow_page_dax(vmf, iter);
		break;
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

	if (error)
		return dax_fault_return(error);

	__SetPageUptodate(vmf->cow_page);
	ret = finish_fault(vmf);
	if (!ret)
		return VM_FAULT_DONE_COW;
	return ret;
}

/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf:	vm fault instance
 * @iter:	iomap iter
 * @pfnp:	pfn to be returned
 * @xas:	the dax mapping tree of a file
 * @entry:	an unlocked dax entry to be inserted
 * @pmd:	distinguish whether it is a pmd fault
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
		const struct iomap_iter *iter, pfn_t *pfnp,
		struct xa_state *xas, void **entry, bool pmd)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
	bool write = iter->flags & IOMAP_WRITE;
	unsigned long entry_flags = pmd ? DAX_PMD : 0;
	int err = 0;
	pfn_t pfn;
	void *kaddr;

	if (!pmd && vmf->cow_page)
		return dax_fault_cow_page(vmf, iter);

	/* if we are reading UNWRITTEN or HOLE, return a hole. */
	if (!write &&
	    (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
		if (!pmd)
			return dax_load_hole(xas, vmf, iter, entry);
		return dax_pmd_load_hole(xas, vmf, iter, entry);
	}

	if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
		WARN_ON_ONCE(1);
		return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
	}

	err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
	if (err)
		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);

	if (write && iomap->flags & IOMAP_F_SHARED) {
		err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
		if (err)
			return dax_fault_return(err);
	}

	if (dax_fault_is_synchronous(iter, vmf->vma))
		return dax_fault_synchronous_pfnp(pfnp, pfn);

	/* insert PMD pfn */
	if (pmd)
		return vmf_insert_pfn_pmd(vmf, pfn, write);

	/* insert PTE pfn */
	if (write)
		return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
	struct iomap_iter iter = {
		.inode		= mapping->host,
		.pos		= (loff_t)vmf->pgoff << PAGE_SHIFT,
		.len		= PAGE_SIZE,
		.flags		= IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = 0;
	void *entry;
	int error;

	trace_dax_pte_fault(iter.inode, vmf, ret);
	/*
	 * Check whether the offset isn't beyond the end of file now. The
	 * caller is supposed to hold locks serializing us with truncate /
	 * punch hole, so this is a reliable test.
	 */
	if (iter.pos >= i_size_read(iter.inode)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		iter.flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(&xas, mapping, 0);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.processed = -EIO;	/* fs corruption? */
			continue;
		}

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
		if (ret != VM_FAULT_SIGBUS &&
		    (iter.iomap.flags & IOMAP_F_NEW)) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			ret |= VM_FAULT_MAJOR;
		}

		if (!(ret & VM_FAULT_ERROR))
			iter.processed = PAGE_SIZE;
	}

	if (iomap_errp)
		*iomap_errp = error;
	if (!ret && error)
		ret = dax_fault_return(error);

unlock_entry:
	dax_unlock_entry(&xas, entry);
out:
	trace_dax_pte_fault_done(iter.inode, vmf, ret);
	return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
		pgoff_t max_pgoff)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/*
	 * Make sure that the faulting address's PMD offset (color) matches
	 * the PMD offset from the start of the file.  This is necessary so
	 * that a PMD range in the page table overlaps exactly with a PMD
	 * range in the page cache.
	 */
	if ((vmf->pgoff & PG_PMD_COLOUR) !=
	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
		return true;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vmf->vma->vm_flags & VM_SHARED))
		return true;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vmf->vma->vm_start)
		return true;
	if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return true;

	/* If the PMD would extend beyond the file size */
	if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
		return true;

	return false;
}

static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
	struct iomap_iter iter = {
		.inode		= mapping->host,
		.len		= PMD_SIZE,
		.flags		= IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = VM_FAULT_FALLBACK;
	pgoff_t max_pgoff;
	void *entry;

	if (vmf->flags & FAULT_FLAG_WRITE)
		iter.flags |= IOMAP_WRITE;

	/*
	 * Check whether the offset isn't beyond the end of file now. The
	 * caller is supposed to hold locks serializing us with truncate /
	 * punch hole, so this is a reliable test.
	 */
	max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

	trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

	if (xas.xa_index >= max_pgoff) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get an empty PMD entry,
	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
	 * entry is already in the array, for instance), it will return
	 * VM_FAULT_FALLBACK.
	 */
	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto fallback;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PTE fault that overlaps with
	 * the PMD we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
			!pmd_devmap(*vmf->pmd)) {
		ret = 0;
		goto unlock_entry;
	}

	iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
	while (iomap_iter(&iter, ops) > 0) {
		if (iomap_length(&iter) < PMD_SIZE)
			continue; /* actually breaks out of the loop */

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
		if (ret != VM_FAULT_FALLBACK)
			iter.processed = PMD_SIZE;
	}

unlock_entry:
	dax_unlock_entry(&xas, entry);
fallback:
	if (ret == VM_FAULT_FALLBACK) {
		split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
	return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @order: Order of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for the page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
	if (order == 0)
		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
	else if (order == PMD_ORDER)
		return dax_iomap_pmd_fault(vmf, pfnp, ops);
	else
		return VM_FAULT_FALLBACK;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
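
/*
 * Example (editorial sketch, not part of this file): the usual shape of a
 * filesystem ->huge_fault handler built on dax_iomap_fault().
 * "example_iomap_ops" is a placeholder; see the sketch before
 * dax_finish_sync_fault() below for the VM_FAULT_NEEDDSYNC half of the
 * story:
 *
 *	static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
 *			unsigned int order)
 *	{
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		pfn_t pfn;
 *		vm_fault_t ret;
 *
 *		filemap_invalidate_lock_shared(inode->i_mapping);
 *		ret = dax_iomap_fault(vmf, order, &pfn, NULL,
 *				      &example_iomap_ops);
 *		filemap_invalidate_unlock_shared(inode->i_mapping);
 *		return ret;
 *	}
 */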

/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmapped DAX file.  It also marks the page cache entry as dirty.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
	void *entry;
	vm_fault_t ret;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, order);
	/* Did we race with someone splitting the entry or similar? */
	if (!entry || dax_is_conflict(entry) ||
	    (order == 0 && !dax_is_pte_entry(entry))) {
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		xas_unlock_irq(&xas);
		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
						      VM_FAULT_NOPAGE);
		return VM_FAULT_NOPAGE;
	}
	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
	dax_lock_entry(&xas, entry);
	xas_unlock_irq(&xas);
	if (order == 0)
		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
	else if (order == PMD_ORDER)
		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
	else
		ret = VM_FAULT_FALLBACK;
	dax_unlock_entry(&xas, entry);
	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
	return ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @order: Order of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and then inserts the appropriate page
 * table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
		pfn_t pfn)
{
	int err;
	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
	size_t len = PAGE_SIZE << order;

	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
	if (err)
		return VM_FAULT_SIGBUS;
	return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
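
/*
 * Example (editorial sketch, not part of this file): completing a
 * synchronous (MAP_SYNC) write fault.  With a handler like the one sketched
 * before dax_iomap_fault() above, a sync fault comes back as
 * VM_FAULT_NEEDDSYNC with the pfn filled in, and the filesystem finishes
 * it like so:
 *
 *	ret = dax_iomap_fault(vmf, order, &pfn, NULL, &example_iomap_ops);
 *	if (ret & VM_FAULT_NEEDDSYNC)
 *		ret = dax_finish_sync_fault(vmf, order, pfn);
 *
 * The vfs_fsync_range() call above is what persists any metadata needed to
 * reach the newly allocated blocks before userspace is allowed to write to
 * them through the mapping.
 */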

static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{
	const struct iomap *smap = &it_src->iomap;
	const struct iomap *dmap = &it_dest->iomap;
	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
	void *saddr, *daddr;
	int id, ret;

	len = min(len, min(smap->length, dmap->length));

	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
		*same = true;
		return len;
	}

	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
		*same = false;
		return 0;
	}

	id = dax_read_lock();
	ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
				      &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
				      &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	*same = !memcmp(saddr, daddr, len);
	if (!*same)
		len = 0;
	dax_read_unlock(id);
	return len;

out_unlock:
	dax_read_unlock(id);
	return -EIO;
}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
		const struct iomap_ops *ops)
{
	struct iomap_iter src_iter = {
		.inode		= src,
		.pos		= srcoff,
		.len		= len,
		.flags		= IOMAP_DAX,
	};
	struct iomap_iter dst_iter = {
		.inode		= dst,
		.pos		= dstoff,
		.len		= len,
		.flags		= IOMAP_DAX,
	};
	int ret, compared = 0;

	while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
	       (ret = iomap_iter(&dst_iter, ops)) > 0) {
		compared = dax_range_compare_iter(&src_iter, &dst_iter,
				min(src_iter.len, dst_iter.len), same);
		if (compared < 0)
			return compared;
		src_iter.processed = dst_iter.processed = compared;
	}
	return ret;
}

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{
	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
					       pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
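
/*
 * Example (editorial sketch, not part of this file): a reflink-capable DAX
 * filesystem calls this from its ->remap_file_range() preparation step,
 * roughly:
 *
 *	if (IS_DAX(inode_in))
 *		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
 *				pos_out, &len, remap_flags,
 *				&example_read_iomap_ops);
 *
 * where "example_read_iomap_ops" stands in for the filesystem's read-side
 * iomap_ops.  For a dedupe request, __generic_remap_file_range_prep() uses
 * those ops to walk both files and ends up in
 * dax_dedupe_file_range_compare() above to verify the ranges really match.
 */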