// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages. We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking. In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
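 *
 * For example (illustrative values): an unlocked PTE-sized entry for
 * pfn 0x1000 is stored as the XArray value (0x1000 << DAX_SHIFT); locking
 * it sets DAX_LOCKED in the low bits, and a huge-page entry additionally
 * carries DAX_PMD.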
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD. This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;

	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it. The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did. The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
	void *entry;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
	}
}

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages).
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies. Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
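	 * (Non-exclusive waiters are woken by every __wake_up() on this
	 * queue whether the waker used WAKE_NEXT or WAKE_ALL, since the
	 * nr_exclusive argument only limits exclusive waiters.)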
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (entry && !dax_is_conflict(entry))
		dax_wake_entry(xas, entry, mode);
}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	BUG_ON(dax_is_locked(entry));
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	BUG_ON(!dax_is_locked(old));
	dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

static unsigned long dax_end_pfn(void *entry)
{
	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
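 *
 * For a PMD entry this visits PMD_SIZE / PAGE_SIZE pfns (e.g. 512 with 2MiB
 * PMDs and 4KiB pages); zero and empty entries have dax_entry_size() == 0,
 * so the loop body never runs for them.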
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_to_pfn(entry); \
			pfn < dax_end_pfn(entry); pfn++)

static inline bool dax_page_is_shared(struct page *page)
{
	return page->mapping == PAGE_MAPPING_DAX_SHARED;
}

/*
 * Set page->mapping to PAGE_MAPPING_DAX_SHARED and increase the refcount.
 */
static inline void dax_page_share_get(struct page *page)
{
	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
		/*
		 * Reset the index if the page was already mapped
		 * regularly before.
		 */
		if (page->mapping)
			page->share = 1;
		page->mapping = PAGE_MAPPING_DAX_SHARED;
	}
	page->share++;
}

static inline unsigned long dax_page_share_put(struct page *page)
{
	return --page->share;
}

/*
 * When called from dax_insert_entry(), the shared flag indicates whether
 * this entry is shared by multiple files. If so, set page->mapping to
 * PAGE_MAPPING_DAX_SHARED and use page->share as the refcount.
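 *
 * Note that page->share overlays page->index in struct page, which is why
 * the count must be (re)initialised when a regularly mapped page first
 * becomes shared.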
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
		struct vm_area_struct *vma, unsigned long address, bool shared)
{
	unsigned long size = dax_entry_size(entry), pfn, index;
	int i = 0;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	index = linear_page_index(vma, address & ~(size - 1));
	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (shared) {
			dax_page_share_get(page);
		} else {
			WARN_ON_ONCE(page->mapping);
			page->mapping = mapping;
			page->index = index + i++;
		}
	}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{
	unsigned long pfn;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
		if (dax_page_is_shared(page)) {
			/* keep the shared flag if this page is still shared */
			if (dax_page_share_put(page) > 0)
				continue;
		} else
			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
		page->mapping = NULL;
		page->index = 0;
	}
}

static struct page *dax_busy_page(void *entry)
{
	unsigned long pfn;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (page_ref_count(page) > 1)
			return page;
	}
	return NULL;
}

/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
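 *
 * A rough usage sketch (as in the memory-failure path):
 * cookie = dax_lock_folio(folio);
 * if (cookie) {
 *	... folio->mapping is stable here ...
 *	dax_unlock_folio(folio, cookie);
 * }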
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/* Ensure folio->mapping isn't freed while we look at it */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(folio->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;

		/*
		 * In the device-dax case there's no need to lock, a
		 * struct dev_pagemap pin is sufficient to keep the
		 * inode alive, and we assume we have dev_pagemap pin
		 * otherwise we would not have a valid pfn_to_page()
		 * translation.
		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		if (mapping != folio->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, folio->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
	struct address_space *mapping = folio->mapping;
	XA_STATE(xas, &mapping->i_pages, folio->index);

	if (S_ISCHR(mapping->host->i_mode))
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
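 *
 * The special cookie ~0UL means no real entry was locked (the mapping is
 * not DAX, the entry is absent, or it is a zero/empty entry);
 * dax_unlock_mapping_entry() treats it as a no-op, so callers may pass it
 * back unconditionally.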
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
		struct page **page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	rcu_read_lock();
	for (;;) {
		entry = NULL;
		if (!dax_mapping(mapping))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		xas_set(&xas, index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		if (!entry ||
		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
			/*
			 * We are looking up the entry by the file's mapping
			 * and index, so it may not have been inserted yet,
			 * or it may be a zero/empty entry. That is not an
			 * error, so return a special value and do not
			 * output @page.
			 */
			entry = (void *)~0UL;
		} else {
			*page = pfn_to_page(dax_to_pfn(entry));
			dax_lock_entry(&xas, entry);
		}
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
		dax_entry_t cookie)
{
	XA_STATE(xas, &mapping->i_pages, index);

	if (cookie == ~0UL)
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries. This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them. We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR. Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
	void *entry;

retry:
	pmd_downgrade = false;
	xas_lock_irq(xas);
	entry = get_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}

		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
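		 * (Zero-page PMD entries have the huge zero page mapped into
		 * userspace, so that range must be unmapped before the entry
		 * can shrink; empty entries were never mapped at all.)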
		 */
		if (dax_is_zero_entry(entry)) {
			xas_unlock_irq(xas);
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
			xas_reset(xas);
			xas_lock_irq(xas);
		}

		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* undo the PMD join */
		dax_wake_entry(xas, entry, WAKE_ALL);
		mapping->nrpages -= PG_PMD_NR;
		entry = NULL;
		xas_set(xas, index);
	}

	if (entry) {
		dax_lock_entry(xas, entry);
	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrpages += 1UL << order;
	}

out_unlock:
	xas_unlock_irq(xas);
	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
		goto retry;
	if (xas->xa_node == XA_ERROR(-ENOMEM))
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
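 *
 * Return: the first busy page found in the range, or NULL if none is found.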
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	/*
	 * In the 'limited' case get_user_pages() for dax is disabled.
	 */
	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return NULL;

	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
		return NULL;

	/* If end == LLONG_MAX, all pages from start to the end of file */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;
	/*
	 * If we race get_user_pages_fast() here either we'll see the
	 * elevated page count in the iteration and wait, or
	 * get_user_pages_fast() will see that the page it took a reference
	 * against is no longer mapped in the page tables and bail to the
	 * get_user_pages() slow path. The slow path is protected by
	 * pte_lock() and pmd_lock(). New references are not taken without
	 * holding those locks, and unmap_mapping_pages() will not zero the
	 * pte or pmd without holding the respective lock, so we are
	 * guaranteed to either see new references or prevent new
	 * references from being established.
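	 * In short, the unmap below plus the locked scan that follows act
	 * as a barrier: any page that is still busy afterwards is reliably
	 * reported.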
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		if (unlikely(dax_is_locked(entry)))
			entry = get_unlocked_entry(&xas, 0);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		if (page)
			break;
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

static int __dax_invalidate_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrpages -= 1UL << dax_entry_order(entry);
	ret = 1;
out:
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);
	return ret;
}

static int __dax_clear_dirty_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned int scanned = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end) {
		entry = get_unlocked_entry(&xas, 0);
		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);

		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);

	return 0;
}

/*
 * Delete DAX entry at @index from @mapping. Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate DAX entry if it is clean.
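 * Returns 1 if the entry was invalidated, or 0 if it was left in place
 * because it was dirty or tagged for writeback (or was absent).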
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}

static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
	return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
}

static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
	pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
	void *vto, *kaddr;
	long rc;
	int id;

	id = dax_read_lock();
	rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
				&kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(vmf->cow_page);
	copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
		struct vm_area_struct *vma)
{
	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
		(iter->iomap.flags & IOMAP_F_DIRTY);
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs. If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
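 *
 * Returns the entry that now resides in the tree: either the newly made
 * entry or the pre-existing PTE/PMD entry that was kept.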
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, pfn_t pfn,
		unsigned long flags)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	void *new_entry = dax_make_entry(pfn, flags);
	bool write = iter->flags & IOMAP_WRITE;
	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
	bool shared = iter->iomap.flags & IOMAP_F_SHARED;

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

	xas_reset(xas);
	xas_lock_irq(xas);
	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
				shared);
		/*
		 * Only swap our new entry into the page cache if the current
		 * entry is a zero page or an empty entry. If a normal PTE or
		 * PMD entry is already in the cache, we leave it alone. This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
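		 * (The WARN_ON_ONCE below sanity-checks this: the value we
		 * replace must be the old entry with its lock bit set.)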
		 */
		old = dax_lock_entry(xas, new_entry);
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	if (write && shared)
		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

	xas_unlock_irq(xas);
	return entry;
}

static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{
	unsigned long pfn, index, count, end;
	long ret = 0;
	struct vm_area_struct *vma;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!xa_is_value(entry)))
		return -EIO;

	if (unlikely(dax_is_locked(entry))) {
		void *old_entry = entry;

		entry = get_unlocked_entry(xas, 0);

		/* Entry got punched out / reallocated? */
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			goto put_unlocked;
		/*
		 * Entry got reallocated elsewhere? No need to writeback.
		 * We have to compare pfns as we must not bail out due to
		 * difference in lockbit or entry type.
		 */
		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
			goto put_unlocked;
		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
					dax_is_zero_entry(entry))) {
			ret = -EIO;
			goto put_unlocked;
		}

		/* Another fsync thread may have already done this entry */
		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
			goto put_unlocked;
	}

	/* Lock the entry to serialize with page faults */
	dax_lock_entry(xas, entry);

	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under the i_pages lock and once they do that
	 * they will see the entry locked and wait for it to unlock.
	 */
	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irq(xas);

	/*
	 * If dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we use needs to be
	 * aligned to the start of the PMD.
	 * This allows us to flush for PMD_SIZE and not have to worry about
	 * partial PMD writebacks.
	 */
	pfn = dax_to_pfn(entry);
	count = 1UL << dax_entry_order(entry);
	index = xas->xa_index & ~(count - 1);
	end = index + count - 1;

	/* Walk all mappings of a given index of a file and writeprotect them */
	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
		pfn_mkclean_range(pfn, count, index, vma);
		cond_resched();
	}
	i_mmap_unlock_read(mapping);

	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	xas_reset(xas);
	xas_lock_irq(xas);
	xas_store(xas, entry);
	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
	dax_wake_entry(xas, entry, WAKE_NEXT);

	trace_dax_writeback_one(mapping->host, index, count);
	return ret;

 put_unlocked:
	put_unlocked_entry(xas, entry, WAKE_NEXT);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
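 *
 * Returns 0 on success, or a negative error such as -EIO propagated from
 * dax_writeback_one().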
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
			break;
		}
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, pfn_t *pfnp)
{
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	int id, rc = 0;
	long length;

	id = dax_read_lock();
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   DAX_ACCESS, kaddr, pfnp);
	if (length < 0) {
		rc = length;
		goto out;
	}
	if (!pfnp)
		goto out_check_addr;
	rc = -EINVAL;
	if (PFN_PHYS(length) < size)
		goto out;
	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
		goto out;
	/* For larger pages we need devmap */
	if (length > 1 && !pfn_t_devmap(*pfnp))
		goto out;
	rc = 0;

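	/*
	 * Reached both by falling through after the pfn checks above and
	 * via the !pfnp goto: validate kaddr if the caller asked for it.
	 */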
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
                size_t size, void **kaddr, pfn_t *pfnp)
{
        pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
        int id, rc = 0;
        long length;

        id = dax_read_lock();
        length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
                        DAX_ACCESS, kaddr, pfnp);
        if (length < 0) {
                rc = length;
                goto out;
        }
        if (!pfnp)
                goto out_check_addr;
        rc = -EINVAL;
        if (PFN_PHYS(length) < size)
                goto out;
        if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size) - 1))
                goto out;
        /* For larger pages we need devmap */
        if (length > 1 && !pfn_t_devmap(*pfnp))
                goto out;
        rc = 0;

out_check_addr:
        if (!kaddr)
                goto out;
        if (!*kaddr)
                rc = -EFAULT;
out:
        dax_read_unlock(id);
        return rc;
}

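/*
 * Illustrative sketch (not part of the kernel build): when a caller asks
 * for a pfn, the helper above enforces both length and alignment, e.g.
 * for a PMD-sized request:
 *
 *      err = dax_iomap_direct_access(iomap, pos, PMD_SIZE, &kaddr, &pfn);
 *
 * err is -EINVAL unless at least PMD_SIZE bytes were mapped and the
 * returned pfn is aligned to PHYS_PFN(PMD_SIZE); dax_fault_iter() below
 * relies on this to decide whether a PMD fault must fall back to PTEs.
 */
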
/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:        address to do copy from.
 * @length:     size of copy operation.
 * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
 * @srcmap:     iomap srcmap
 * @daddr:      destination address to copy to.
 *
 * This can be called from two places. Either during a DAX write fault (page
 * aligned), to copy @length bytes to @daddr. Or, while doing a normal DAX
 * write operation, dax_iomap_iter() might call this to copy the unaligned
 * start or end of the range. In the latter case the copy of the aligned
 * middle is taken care of by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE or UNWRITTEN, zero the
 * edges to make sure no old data remains.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
                const struct iomap *srcmap, void *daddr)
{
        loff_t head_off = pos & (align_size - 1);
        size_t size = ALIGN(head_off + length, align_size);
        loff_t end = pos + length;
        loff_t pg_end = round_up(end, align_size);
        /* copy_all is usually in page fault case */
        bool copy_all = head_off == 0 && end == pg_end;
        /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
        bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
                         srcmap->type == IOMAP_UNWRITTEN;
        void *saddr = NULL;
        int ret = 0;

        if (!zero_edge) {
                ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
                if (ret)
                        return dax_mem2blk_err(ret);
        }

        if (copy_all) {
                if (zero_edge)
                        memset(daddr, 0, size);
                else
                        ret = copy_mc_to_kernel(daddr, saddr, length);
                goto out;
        }

        /* Copy the head part of the range */
        if (head_off) {
                if (zero_edge)
                        memset(daddr, 0, head_off);
                else {
                        ret = copy_mc_to_kernel(daddr, saddr, head_off);
                        if (ret)
                                return -EIO;
                }
        }

        /* Copy the tail part of the range */
        if (end < pg_end) {
                loff_t tail_off = head_off + length;
                loff_t tail_len = pg_end - end;

                if (zero_edge)
                        memset(daddr + tail_off, 0, tail_len);
                else {
                        ret = copy_mc_to_kernel(daddr + tail_off,
                                        saddr + tail_off, tail_len);
                        if (ret)
                                return -EIO;
                }
        }
out:
        if (zero_edge)
                dax_flush(srcmap->dax_dev, daddr, size);
        return ret ? -EIO : 0;
}

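/*
 * Illustrative sketch (not part of the kernel build): for a 1k write at
 * file offset 4.5k with align_size == PAGE_SIZE (4k), the math above
 * works out to:
 *
 *      head_off = 0x1200 & 0xfff;                      // 0x200
 *      size     = ALIGN(0x200 + 0x400, 0x1000);        // one 4k page
 *      end      = 0x1200 + 0x400;                      // 0x1600
 *      pg_end   = round_up(0x1600, 0x1000);            // 0x2000
 *
 * so dax_iomap_copy_around() copies (or zeroes) bytes [0, 0x200) and
 * [0x600, 0x1000) of the destination page while the caller writes the
 * middle [0x200, 0x600) itself.
 */
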
/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
                const struct iomap_iter *iter, void **entry)
{
        struct inode *inode = iter->inode;
        unsigned long vaddr = vmf->address;
        pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
        vm_fault_t ret;

        *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

        ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
        trace_dax_load_hole(inode, vmf, ret);
        return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
                const struct iomap_iter *iter, void **entry)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        struct inode *inode = mapping->host;
        pgtable_t pgtable = NULL;
        struct page *zero_page;
        spinlock_t *ptl;
        pmd_t pmd_entry;
        pfn_t pfn;

        zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

        if (unlikely(!zero_page))
                goto fallback;

        pfn = page_to_pfn_t(zero_page);
        *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
                                  DAX_PMD | DAX_ZERO_PAGE);

        if (arch_needs_pgtable_deposit()) {
                pgtable = pte_alloc_one(vma->vm_mm);
                if (!pgtable)
                        return VM_FAULT_OOM;
        }

        ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (!pmd_none(*(vmf->pmd))) {
                spin_unlock(ptl);
                goto fallback;
        }

        if (pgtable) {
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                mm_inc_nr_ptes(vma->vm_mm);
        }
        pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
        pmd_entry = pmd_mkhuge(pmd_entry);
        set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
        spin_unlock(ptl);
        trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
        return VM_FAULT_NOPAGE;

fallback:
        if (pgtable)
                pte_free(vma->vm_mm, pgtable);
        trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
        return VM_FAULT_FALLBACK;
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
                const struct iomap_iter *iter, void **entry)
{
        return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

static s64 dax_unshare_iter(struct iomap_iter *iter)
{
        struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        loff_t pos = iter->pos;
        loff_t length = iomap_length(iter);
        int id = 0;
        s64 ret = 0;
        void *daddr = NULL, *saddr = NULL;

        /* don't bother with blocks that are not shared to start with */
        if (!(iomap->flags & IOMAP_F_SHARED))
                return length;

        id = dax_read_lock();
        ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
        if (ret < 0)
                goto out_unlock;

        /* zero the whole range if srcmap is a HOLE or UNWRITTEN */
        if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
                memset(daddr, 0, length);
                dax_flush(iomap->dax_dev, daddr, length);
                ret = length;
                goto out_unlock;
        }

        ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
        if (ret < 0)
                goto out_unlock;

        if (copy_mc_to_kernel(daddr, saddr, length) == 0)
                ret = length;
        else
                ret = -EIO;

out_unlock:
        dax_read_unlock(id);
        return dax_mem2blk_err(ret);
}

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                const struct iomap_ops *ops)
{
        struct iomap_iter iter = {
                .inode          = inode,
                .pos            = pos,
                .len            = len,
                .flags          = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
        };
        int ret;

        while ((ret = iomap_iter(&iter, ops)) > 0)
                iter.processed = dax_unshare_iter(&iter);
        return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);

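/*
 * Illustrative sketch (hypothetical names, not part of this file): a
 * reflink-capable filesystem would call dax_file_unshare() from its
 * fallocate(FALLOC_FL_UNSHARE_RANGE) path to break sharing by copying
 * the shared blocks in place, e.g.:
 *
 *      error = dax_file_unshare(inode, offset, len, &example_iomap_ops);
 *
 * where example_iomap_ops is the fs's write iomap_ops that reports
 * shared extents with IOMAP_F_SHARED; xfs does something along these
 * lines for DAX files in its reflink unshare path.
 */
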
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
        const struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        unsigned offset = offset_in_page(pos);
        pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
        void *kaddr;
        long ret;

        ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
                        NULL);
        if (ret < 0)
                return dax_mem2blk_err(ret);

        memset(kaddr + offset, 0, size);
        if (iomap->flags & IOMAP_F_SHARED)
                ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
                                            kaddr);
        else
                dax_flush(iomap->dax_dev, kaddr + offset, size);
        return ret;
}

static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
        const struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        loff_t pos = iter->pos;
        u64 length = iomap_length(iter);
        s64 written = 0;

        /* already zeroed?  we're done. */
        if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
                return length;

        /*
         * invalidate the pages whose sharing state is to be changed
         * because of CoW.
         */
        if (iomap->flags & IOMAP_F_SHARED)
                invalidate_inode_pages2_range(iter->inode->i_mapping,
                                              pos >> PAGE_SHIFT,
                                              (pos + length - 1) >> PAGE_SHIFT);

        do {
                unsigned offset = offset_in_page(pos);
                unsigned size = min_t(u64, PAGE_SIZE - offset, length);
                pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
                long rc;
                int id;

                id = dax_read_lock();
                if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
                        rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
                else
                        rc = dax_memzero(iter, pos, size);
                dax_read_unlock(id);

                if (rc < 0)
                        return rc;
                pos += size;
                length -= size;
                written += size;
        } while (length > 0);

        if (did_zero)
                *did_zero = true;
        return written;
}

int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
                const struct iomap_ops *ops)
{
        struct iomap_iter iter = {
                .inode          = inode,
                .pos            = pos,
                .len            = len,
                .flags          = IOMAP_DAX | IOMAP_ZERO,
        };
        int ret;

        while ((ret = iomap_iter(&iter, ops)) > 0)
                iter.processed = dax_zero_iter(&iter, did_zero);
        return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);

int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
                const struct iomap_ops *ops)
{
        unsigned int blocksize = i_blocksize(inode);
        unsigned int off = pos & (blocksize - 1);

        /* Block boundary?  Nothing to do */
        if (!off)
                return 0;
        return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);

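/*
 * Illustrative sketch (hypothetical names, not part of this file): a
 * filesystem typically zeroes the partial tail block when shrinking an
 * inode, before it updates i_size, e.g.:
 *
 *      error = dax_truncate_page(inode, newsize, &did_zero,
 *                      &example_iomap_ops);
 *
 * which is a thin wrapper around dax_zero_range() for the sub-block
 * span from @newsize up to the next block boundary.
 */
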
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
                struct iov_iter *iter)
{
        const struct iomap *iomap = &iomi->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iomi);
        loff_t length = iomap_length(iomi);
        loff_t pos = iomi->pos;
        struct dax_device *dax_dev = iomap->dax_dev;
        loff_t end = pos + length, done = 0;
        bool write = iov_iter_rw(iter) == WRITE;
        bool cow = write && iomap->flags & IOMAP_F_SHARED;
        ssize_t ret = 0;
        size_t xfer;
        int id;

        if (!write) {
                end = min(end, i_size_read(iomi->inode));
                if (pos >= end)
                        return 0;

                if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
                        return iov_iter_zero(min(length, end - pos), iter);
        }

        /*
         * In DAX mode, enforce either pure overwrites of written extents, or
         * writes to unwritten extents as part of a copy-on-write operation.
         */
        if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
                        !(iomap->flags & IOMAP_F_SHARED)))
                return -EIO;

        /*
         * Write can allocate blocks for an area which has a hole page mapped
         * into page tables. We have to tear down these mappings so that data
         * written by write(2) is visible in mmap.
         */
        if (iomap->flags & IOMAP_F_NEW || cow) {
                /*
                 * Filesystem allows CoW on non-shared extents. The src extents
                 * may have been mmapped and dirtied before. To be able to
                 * invalidate their dax entries, we need to clear the dirty
                 * mark in advance.
                 */
                if (cow)
                        __dax_clear_dirty_range(iomi->inode->i_mapping,
                                                pos >> PAGE_SHIFT,
                                                (end - 1) >> PAGE_SHIFT);
                invalidate_inode_pages2_range(iomi->inode->i_mapping,
                                              pos >> PAGE_SHIFT,
                                              (end - 1) >> PAGE_SHIFT);
        }

        id = dax_read_lock();
        while (pos < end) {
                unsigned offset = pos & (PAGE_SIZE - 1);
                const size_t size = ALIGN(length + offset, PAGE_SIZE);
                pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
                ssize_t map_len;
                bool recovery = false;
                void *kaddr;

                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
                                DAX_ACCESS, &kaddr, NULL);
                if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
                        map_len = dax_direct_access(dax_dev, pgoff,
                                        PHYS_PFN(size), DAX_RECOVERY_WRITE,
                                        &kaddr, NULL);
                        if (map_len > 0)
                                recovery = true;
                }
                if (map_len < 0) {
                        ret = dax_mem2blk_err(map_len);
                        break;
                }

                if (cow) {
                        ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
                                                    srcmap, kaddr);
                        if (ret)
                                break;
                }

                map_len = PFN_PHYS(map_len);
                kaddr += offset;
                map_len -= offset;
                if (map_len > end - pos)
                        map_len = end - pos;

                if (recovery)
                        xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
                                        map_len, iter);
                else if (write)
                        xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
                                        map_len, iter);
                else
                        xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
                                        map_len, iter);

                pos += xfer;
                length -= xfer;
                done += xfer;

                if (xfer == 0)
                        ret = -EFAULT;
                if (xfer < map_len)
                        break;
        }
        dax_read_unlock(id);

        return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops)
{
        struct iomap_iter iomi = {
                .inode          = iocb->ki_filp->f_mapping->host,
                .pos            = iocb->ki_pos,
                .len            = iov_iter_count(iter),
                .flags          = IOMAP_DAX,
        };
        loff_t done = 0;
        int ret;

        if (!iomi.len)
                return 0;

        if (iov_iter_rw(iter) == WRITE) {
                lockdep_assert_held_write(&iomi.inode->i_rwsem);
                iomi.flags |= IOMAP_WRITE;
        } else {
                lockdep_assert_held(&iomi.inode->i_rwsem);
        }

        if (iocb->ki_flags & IOCB_NOWAIT)
                iomi.flags |= IOMAP_NOWAIT;

        while ((ret = iomap_iter(&iomi, ops)) > 0)
                iomi.processed = dax_iomap_iter(&iomi, iter);

        done = iomi.pos - iocb->ki_pos;
        iocb->ki_pos = iomi.pos;
        return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

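/*
 * Illustrative sketch (hypothetical "example_*" names, not part of this
 * file): a filesystem calls dax_iomap_rw() from its ->read_iter() and
 * ->write_iter() methods while holding the inode lock that the lockdep
 * assertions above expect, e.g. for reads:
 *
 *      static ssize_t example_dax_read_iter(struct kiocb *iocb,
 *                      struct iov_iter *to)
 *      {
 *              struct inode *inode = file_inode(iocb->ki_filp);
 *              ssize_t ret;
 *
 *              inode_lock_shared(inode);
 *              ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
 *              inode_unlock_shared(inode);
 *              return ret;
 *      }
 *
 * This mirrors the pattern used by ext4 and xfs for DAX I/O.
 */
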
static vm_fault_t dax_fault_return(int error)
{
        if (error == 0)
                return VM_FAULT_NOPAGE;
        return vmf_error(error);
}

/*
 * When handling a synchronous page fault and the inode needs fsync, we can
 * insert the PTE/PMD into page tables only after that fsync happened. Skip
 * insertion for now and return the pfn so that the caller can insert it
 * after the fsync is done.
 */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{
        if (WARN_ON_ONCE(!pfnp))
                return VM_FAULT_SIGBUS;
        *pfnp = pfn;
        return VM_FAULT_NEEDDSYNC;
}

static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
                const struct iomap_iter *iter)
{
        vm_fault_t ret;
        int error = 0;

        switch (iter->iomap.type) {
        case IOMAP_HOLE:
        case IOMAP_UNWRITTEN:
                clear_user_highpage(vmf->cow_page, vmf->address);
                break;
        case IOMAP_MAPPED:
                error = copy_cow_page_dax(vmf, iter);
                break;
        default:
                WARN_ON_ONCE(1);
                error = -EIO;
                break;
        }

        if (error)
                return dax_fault_return(error);

        __SetPageUptodate(vmf->cow_page);
        ret = finish_fault(vmf);
        if (!ret)
                return VM_FAULT_DONE_COW;
        return ret;
}

/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf:	vm fault instance
 * @iter:	iomap iter
 * @pfnp:	pfn to be returned
 * @xas:	the dax mapping tree of a file
 * @entry:	an unlocked dax entry to be inserted
 * @pmd:	distinguish whether it is a pmd fault
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
                const struct iomap_iter *iter, pfn_t *pfnp,
                struct xa_state *xas, void **entry, bool pmd)
{
        const struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
        loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
        bool write = iter->flags & IOMAP_WRITE;
        unsigned long entry_flags = pmd ? DAX_PMD : 0;
        int err = 0;
        pfn_t pfn;
        void *kaddr;

        if (!pmd && vmf->cow_page)
                return dax_fault_cow_page(vmf, iter);

        /* if we are reading from UNWRITTEN or HOLE, return a hole. */
        if (!write &&
            (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
                if (!pmd)
                        return dax_load_hole(xas, vmf, iter, entry);
                return dax_pmd_load_hole(xas, vmf, iter, entry);
        }

        if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
                WARN_ON_ONCE(1);
                return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
        }

        err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
        if (err)
                return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

        *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);

        if (write && iomap->flags & IOMAP_F_SHARED) {
                err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
                if (err)
                        return dax_fault_return(err);
        }

        if (dax_fault_is_synchronous(iter, vmf->vma))
                return dax_fault_synchronous_pfnp(pfnp, pfn);

        /* insert PMD pfn */
        if (pmd)
                return vmf_insert_pfn_pmd(vmf, pfn, write);

        /* insert PTE pfn */
        if (write)
                return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
        return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               int *iomap_errp, const struct iomap_ops *ops)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
        struct iomap_iter iter = {
                .inode          = mapping->host,
                .pos            = (loff_t)vmf->pgoff << PAGE_SHIFT,
                .len            = PAGE_SIZE,
                .flags          = IOMAP_DAX | IOMAP_FAULT,
        };
        vm_fault_t ret = 0;
        void *entry;
        int error;

        trace_dax_pte_fault(iter.inode, vmf, ret);
        /*
         * Check whether offset isn't beyond end of file now. Caller is
         * supposed to hold locks serializing us with truncate / punch hole so
         * this is a reliable test.
         */
        if (iter.pos >= i_size_read(iter.inode)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }

        if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
                iter.flags |= IOMAP_WRITE;

        entry = grab_mapping_entry(&xas, mapping, 0);
        if (xa_is_internal(entry)) {
                ret = xa_to_internal(entry);
                goto out;
        }

        /*
         * It is possible, particularly with mixed reads & writes to private
         * mappings, that we have raced with a PMD fault that overlaps with
         * the PTE we need to set up.  If so just return and the fault will be
         * retried.
         */
        if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
                ret = VM_FAULT_NOPAGE;
                goto unlock_entry;
        }

        while ((error = iomap_iter(&iter, ops)) > 0) {
                if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
                        iter.processed = -EIO;	/* fs corruption? */
                        continue;
                }

                ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
                if (ret != VM_FAULT_SIGBUS &&
                    (iter.iomap.flags & IOMAP_F_NEW)) {
                        count_vm_event(PGMAJFAULT);
                        count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                        ret |= VM_FAULT_MAJOR;
                }

                if (!(ret & VM_FAULT_ERROR))
                        iter.processed = PAGE_SIZE;
        }

        if (iomap_errp)
                *iomap_errp = error;
        if (!ret && error)
                ret = dax_fault_return(error);

unlock_entry:
        dax_unlock_entry(&xas, entry);
out:
        trace_dax_pte_fault_done(iter.inode, vmf, ret);
        return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
                pgoff_t max_pgoff)
{
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        bool write = vmf->flags & FAULT_FLAG_WRITE;

        /*
         * Make sure that the faulting address's PMD offset (colour) matches
         * the PMD offset from the start of the file.  This is necessary so
         * that a PMD range in the page table overlaps exactly with a PMD
         * range in the page cache.
         */
        if ((vmf->pgoff & PG_PMD_COLOUR) !=
            ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
                return true;

        /* Fall back to PTEs if we're going to COW */
        if (write && !(vmf->vma->vm_flags & VM_SHARED))
                return true;

        /* If the PMD would extend outside the VMA */
        if (pmd_addr < vmf->vma->vm_start)
                return true;
        if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
                return true;

        /* If the PMD would extend beyond the file size */
        if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
                return true;

        return false;
}

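/*
 * Illustrative sketch (not part of the kernel build): with 4k pages
 * PG_PMD_COLOUR is 511, so a PMD fault is only attempted when the file
 * offset and the virtual address are congruent modulo 2M, e.g.:
 *
 *      vmf->pgoff = 0x345, vmf->address >> PAGE_SHIFT = 0x10345
 *      0x345 & 511 == 0x145, 0x10345 & 511 == 0x145    -> colours match
 *
 * whereas pgoff 0x345 against address page 0x10344 would give colours
 * 0x145 and 0x144, so dax_fault_check_fallback() returns true and the
 * fault falls back to PTEs.
 */
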
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
        struct iomap_iter iter = {
                .inode          = mapping->host,
                .len            = PMD_SIZE,
                .flags          = IOMAP_DAX | IOMAP_FAULT,
        };
        vm_fault_t ret = VM_FAULT_FALLBACK;
        pgoff_t max_pgoff;
        void *entry;

        if (vmf->flags & FAULT_FLAG_WRITE)
                iter.flags |= IOMAP_WRITE;

        /*
         * Check whether offset isn't beyond end of file now. Caller is
         * supposed to hold locks serializing us with truncate / punch hole so
         * this is a reliable test.
         */
        max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

        trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

        if (xas.xa_index >= max_pgoff) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }

        if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
                goto fallback;

        /*
         * grab_mapping_entry() will make sure we get an empty PMD entry,
         * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
         * entry is already in the array, for instance), it will return
         * VM_FAULT_FALLBACK.
         */
        entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
        if (xa_is_internal(entry)) {
                ret = xa_to_internal(entry);
                goto fallback;
        }

        /*
         * It is possible, particularly with mixed reads & writes to private
         * mappings, that we have raced with a PTE fault that overlaps with
         * the PMD we need to set up.  If so just return and the fault will be
         * retried.
         */
        if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
                        !pmd_devmap(*vmf->pmd)) {
                ret = 0;
                goto unlock_entry;
        }

        iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
        while (iomap_iter(&iter, ops) > 0) {
                if (iomap_length(&iter) < PMD_SIZE)
                        continue; /* actually breaks out of the loop */

                ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
                if (ret != VM_FAULT_FALLBACK)
                        iter.processed = PMD_SIZE;
        }

unlock_entry:
        dax_unlock_entry(&xas, entry);
fallback:
        if (ret == VM_FAULT_FALLBACK) {
                split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
                count_vm_event(THP_FAULT_FALLBACK);
        }
out:
        trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
        return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
{
        return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf:	The description of the fault
 * @order:	Order of the page to fault in
 * @pfnp:	PFN to insert for synchronous faults if fsync is required
 * @iomap_errp:	Storage for detailed error code in case of error
 * @ops:	Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
                    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
        if (order == 0)
                return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
        else if (order == PMD_ORDER)
                return dax_iomap_pmd_fault(vmf, pfnp, ops);
        else
                return VM_FAULT_FALLBACK;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);

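/*
 * Illustrative sketch (hypothetical "example_*" names, not part of this
 * file): a filesystem typically calls dax_iomap_fault() from both its
 * ->fault and ->huge_fault handlers while holding a lock that serializes
 * against truncate, e.g.:
 *
 *      static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
 *                      unsigned int order)
 *      {
 *              struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 *              vm_fault_t ret;
 *              pfn_t pfn;
 *
 *              filemap_invalidate_lock_shared(mapping);
 *              ret = dax_iomap_fault(vmf, order, &pfn, NULL,
 *                              &example_iomap_ops);
 *              filemap_invalidate_unlock_shared(mapping);
 *              return ret;
 *      }
 *
 * This is a minimal read-side sketch; a real handler also starts a
 * pagefault transaction for writes and checks for VM_FAULT_NEEDDSYNC,
 * as shown with the sync-fault helpers below.
 */
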
/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmapped DAX file.  It also marks the page cache entry as dirty.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
        void *entry;
        vm_fault_t ret;

        xas_lock_irq(&xas);
        entry = get_unlocked_entry(&xas, order);
        /* Did we race with someone splitting the entry? */
        if (!entry || dax_is_conflict(entry) ||
            (order == 0 && !dax_is_pte_entry(entry))) {
                put_unlocked_entry(&xas, entry, WAKE_NEXT);
                xas_unlock_irq(&xas);
                trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
                                                      VM_FAULT_NOPAGE);
                return VM_FAULT_NOPAGE;
        }
        xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
        dax_lock_entry(&xas, entry);
        xas_unlock_irq(&xas);
        if (order == 0)
                ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
        else if (order == PMD_ORDER)
                ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
        else
                ret = VM_FAULT_FALLBACK;
        dax_unlock_entry(&xas, entry);
        trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
        return ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf:	The description of the fault
 * @order:	Order of entry to be inserted
 * @pfn:	PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and handles insertion of the appropriate
 * page table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
                pfn_t pfn)
{
        int err;
        loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
        size_t len = PAGE_SIZE << order;

        err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
        if (err)
                return VM_FAULT_SIGBUS;
        return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);

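/*
 * Illustrative sketch (example_iomap_ops is an assumption, not part of
 * this file): a write fault on a sync mapping comes back from
 * dax_iomap_fault() as VM_FAULT_NEEDDSYNC with the pfn filled in; the
 * fault handler then persists the metadata and completes the fault:
 *
 *      ret = dax_iomap_fault(vmf, order, &pfn, NULL, &example_iomap_ops);
 *      if (ret & VM_FAULT_NEEDDSYNC)
 *              ret = dax_finish_sync_fault(vmf, order, pfn);
 *
 * ext4 and xfs follow this pattern in their DAX fault handlers.
 */
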
static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
                struct iomap_iter *it_dest, u64 len, bool *same)
{
        const struct iomap *smap = &it_src->iomap;
        const struct iomap *dmap = &it_dest->iomap;
        loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
        void *saddr, *daddr;
        int id, ret;

        len = min(len, min(smap->length, dmap->length));

        if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
                *same = true;
                return len;
        }

        if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
                *same = false;
                return 0;
        }

        id = dax_read_lock();
        ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
                                      &saddr, NULL);
        if (ret < 0)
                goto out_unlock;

        ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
                                      &daddr, NULL);
        if (ret < 0)
                goto out_unlock;

        *same = !memcmp(saddr, daddr, len);
        if (!*same)
                len = 0;
        dax_read_unlock(id);
        return len;

out_unlock:
        dax_read_unlock(id);
        return -EIO;
}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
                struct inode *dst, loff_t dstoff, loff_t len, bool *same,
                const struct iomap_ops *ops)
{
        struct iomap_iter src_iter = {
                .inode          = src,
                .pos            = srcoff,
                .len            = len,
                .flags          = IOMAP_DAX,
        };
        struct iomap_iter dst_iter = {
                .inode          = dst,
                .pos            = dstoff,
                .len            = len,
                .flags          = IOMAP_DAX,
        };
        int ret, compared = 0;

        while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
               (ret = iomap_iter(&dst_iter, ops)) > 0) {
                compared = dax_range_compare_iter(&src_iter, &dst_iter,
                                min(src_iter.len, dst_iter.len), same);
                if (compared < 0)
                        return ret;
                src_iter.processed = dst_iter.processed = compared;
        }
        return ret;
}

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t *len, unsigned int remap_flags,
                              const struct iomap_ops *ops)
{
        return __generic_remap_file_range_prep(file_in, pos_in, file_out,
                                               pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);

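/*
 * Illustrative sketch (hypothetical names, not part of this file): a
 * reflink-capable filesystem would call dax_remap_file_range_prep() from
 * its ->remap_file_range() method instead of
 * generic_remap_file_range_prep() when both inodes are DAX, so that the
 * byte comparison for dedupe runs through
 * dax_dedupe_file_range_compare() above, e.g.:
 *
 *      if (IS_DAX(inode_in) && IS_DAX(inode_out))
 *              ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
 *                              pos_out, &len, remap_flags,
 *                              &example_iomap_ops);
 */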