162306a36Sopenharmony_ci/* 262306a36Sopenharmony_ci * hugetlbpage-backed filesystem. Based on ramfs. 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * Nadia Yvette Chambers, 2002 562306a36Sopenharmony_ci * 662306a36Sopenharmony_ci * Copyright (C) 2002 Linus Torvalds. 762306a36Sopenharmony_ci * License: GPL 862306a36Sopenharmony_ci */ 962306a36Sopenharmony_ci 1062306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci#include <linux/thread_info.h> 1362306a36Sopenharmony_ci#include <asm/current.h> 1462306a36Sopenharmony_ci#include <linux/falloc.h> 1562306a36Sopenharmony_ci#include <linux/fs.h> 1662306a36Sopenharmony_ci#include <linux/mount.h> 1762306a36Sopenharmony_ci#include <linux/file.h> 1862306a36Sopenharmony_ci#include <linux/kernel.h> 1962306a36Sopenharmony_ci#include <linux/writeback.h> 2062306a36Sopenharmony_ci#include <linux/pagemap.h> 2162306a36Sopenharmony_ci#include <linux/highmem.h> 2262306a36Sopenharmony_ci#include <linux/init.h> 2362306a36Sopenharmony_ci#include <linux/string.h> 2462306a36Sopenharmony_ci#include <linux/capability.h> 2562306a36Sopenharmony_ci#include <linux/ctype.h> 2662306a36Sopenharmony_ci#include <linux/backing-dev.h> 2762306a36Sopenharmony_ci#include <linux/hugetlb.h> 2862306a36Sopenharmony_ci#include <linux/pagevec.h> 2962306a36Sopenharmony_ci#include <linux/fs_parser.h> 3062306a36Sopenharmony_ci#include <linux/mman.h> 3162306a36Sopenharmony_ci#include <linux/slab.h> 3262306a36Sopenharmony_ci#include <linux/dnotify.h> 3362306a36Sopenharmony_ci#include <linux/statfs.h> 3462306a36Sopenharmony_ci#include <linux/security.h> 3562306a36Sopenharmony_ci#include <linux/magic.h> 3662306a36Sopenharmony_ci#include <linux/migrate.h> 3762306a36Sopenharmony_ci#include <linux/uio.h> 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci#include <linux/uaccess.h> 4062306a36Sopenharmony_ci#include <linux/sched/mm.h> 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_cistatic const struct 
address_space_operations hugetlbfs_aops; 4362306a36Sopenharmony_ciconst struct file_operations hugetlbfs_file_operations; 4462306a36Sopenharmony_cistatic const struct inode_operations hugetlbfs_dir_inode_operations; 4562306a36Sopenharmony_cistatic const struct inode_operations hugetlbfs_inode_operations; 4662306a36Sopenharmony_ci 4762306a36Sopenharmony_cienum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_cistruct hugetlbfs_fs_context { 5062306a36Sopenharmony_ci struct hstate *hstate; 5162306a36Sopenharmony_ci unsigned long long max_size_opt; 5262306a36Sopenharmony_ci unsigned long long min_size_opt; 5362306a36Sopenharmony_ci long max_hpages; 5462306a36Sopenharmony_ci long nr_inodes; 5562306a36Sopenharmony_ci long min_hpages; 5662306a36Sopenharmony_ci enum hugetlbfs_size_type max_val_type; 5762306a36Sopenharmony_ci enum hugetlbfs_size_type min_val_type; 5862306a36Sopenharmony_ci kuid_t uid; 5962306a36Sopenharmony_ci kgid_t gid; 6062306a36Sopenharmony_ci umode_t mode; 6162306a36Sopenharmony_ci}; 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_ciint sysctl_hugetlb_shm_group; 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_cienum hugetlb_param { 6662306a36Sopenharmony_ci Opt_gid, 6762306a36Sopenharmony_ci Opt_min_size, 6862306a36Sopenharmony_ci Opt_mode, 6962306a36Sopenharmony_ci Opt_nr_inodes, 7062306a36Sopenharmony_ci Opt_pagesize, 7162306a36Sopenharmony_ci Opt_size, 7262306a36Sopenharmony_ci Opt_uid, 7362306a36Sopenharmony_ci}; 7462306a36Sopenharmony_ci 7562306a36Sopenharmony_cistatic const struct fs_parameter_spec hugetlb_fs_parameters[] = { 7662306a36Sopenharmony_ci fsparam_u32 ("gid", Opt_gid), 7762306a36Sopenharmony_ci fsparam_string("min_size", Opt_min_size), 7862306a36Sopenharmony_ci fsparam_u32oct("mode", Opt_mode), 7962306a36Sopenharmony_ci fsparam_string("nr_inodes", Opt_nr_inodes), 8062306a36Sopenharmony_ci fsparam_string("pagesize", Opt_pagesize), 8162306a36Sopenharmony_ci 
fsparam_string("size", Opt_size), 8262306a36Sopenharmony_ci fsparam_u32 ("uid", Opt_uid), 8362306a36Sopenharmony_ci {} 8462306a36Sopenharmony_ci}; 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_ci#ifdef CONFIG_NUMA 8762306a36Sopenharmony_cistatic inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, 8862306a36Sopenharmony_ci struct inode *inode, pgoff_t index) 8962306a36Sopenharmony_ci{ 9062306a36Sopenharmony_ci vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, 9162306a36Sopenharmony_ci index); 9262306a36Sopenharmony_ci} 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_cistatic inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) 9562306a36Sopenharmony_ci{ 9662306a36Sopenharmony_ci mpol_cond_put(vma->vm_policy); 9762306a36Sopenharmony_ci} 9862306a36Sopenharmony_ci#else 9962306a36Sopenharmony_cistatic inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, 10062306a36Sopenharmony_ci struct inode *inode, pgoff_t index) 10162306a36Sopenharmony_ci{ 10262306a36Sopenharmony_ci} 10362306a36Sopenharmony_ci 10462306a36Sopenharmony_cistatic inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) 10562306a36Sopenharmony_ci{ 10662306a36Sopenharmony_ci} 10762306a36Sopenharmony_ci#endif 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_ci/* 11062306a36Sopenharmony_ci * Mask used when checking the page offset value passed in via system 11162306a36Sopenharmony_ci * calls. This value will be converted to a loff_t which is signed. 11262306a36Sopenharmony_ci * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the 11362306a36Sopenharmony_ci * value. The extra bit (- 1 in the shift value) is to take the sign 11462306a36Sopenharmony_ci * bit into account. 
11562306a36Sopenharmony_ci */ 11662306a36Sopenharmony_ci#define PGOFF_LOFFT_MAX \ 11762306a36Sopenharmony_ci (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_cistatic int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 12062306a36Sopenharmony_ci{ 12162306a36Sopenharmony_ci struct inode *inode = file_inode(file); 12262306a36Sopenharmony_ci struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); 12362306a36Sopenharmony_ci loff_t len, vma_len; 12462306a36Sopenharmony_ci int ret; 12562306a36Sopenharmony_ci struct hstate *h = hstate_file(file); 12662306a36Sopenharmony_ci vm_flags_t vm_flags; 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci /* 12962306a36Sopenharmony_ci * vma address alignment (but not the pgoff alignment) has 13062306a36Sopenharmony_ci * already been checked by prepare_hugepage_range. If you add 13162306a36Sopenharmony_ci * any error returns here, do so after setting VM_HUGETLB, so 13262306a36Sopenharmony_ci * is_vm_hugetlb_page tests below unmap_region go the right 13362306a36Sopenharmony_ci * way when do_mmap unwinds (may be important on powerpc 13462306a36Sopenharmony_ci * and ia64). 13562306a36Sopenharmony_ci */ 13662306a36Sopenharmony_ci vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); 13762306a36Sopenharmony_ci vma->vm_ops = &hugetlb_vm_ops; 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci ret = seal_check_future_write(info->seals, vma); 14062306a36Sopenharmony_ci if (ret) 14162306a36Sopenharmony_ci return ret; 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci /* 14462306a36Sopenharmony_ci * page based offset in vm_pgoff could be sufficiently large to 14562306a36Sopenharmony_ci * overflow a loff_t when converted to byte offset. This can 14662306a36Sopenharmony_ci * only happen on architectures where sizeof(loff_t) == 14762306a36Sopenharmony_ci * sizeof(unsigned long). So, only check in those instances. 
14862306a36Sopenharmony_ci */ 14962306a36Sopenharmony_ci if (sizeof(unsigned long) == sizeof(loff_t)) { 15062306a36Sopenharmony_ci if (vma->vm_pgoff & PGOFF_LOFFT_MAX) 15162306a36Sopenharmony_ci return -EINVAL; 15262306a36Sopenharmony_ci } 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_ci /* must be huge page aligned */ 15562306a36Sopenharmony_ci if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 15662306a36Sopenharmony_ci return -EINVAL; 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_ci vma_len = (loff_t)(vma->vm_end - vma->vm_start); 15962306a36Sopenharmony_ci len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 16062306a36Sopenharmony_ci /* check for overflow */ 16162306a36Sopenharmony_ci if (len < vma_len) 16262306a36Sopenharmony_ci return -EINVAL; 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci inode_lock(inode); 16562306a36Sopenharmony_ci file_accessed(file); 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci ret = -ENOMEM; 16862306a36Sopenharmony_ci 16962306a36Sopenharmony_ci vm_flags = vma->vm_flags; 17062306a36Sopenharmony_ci /* 17162306a36Sopenharmony_ci * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip 17262306a36Sopenharmony_ci * reserving here. Note: only for SHM hugetlbfs file, the inode 17362306a36Sopenharmony_ci * flag S_PRIVATE is set. 
17462306a36Sopenharmony_ci */ 17562306a36Sopenharmony_ci if (inode->i_flags & S_PRIVATE) 17662306a36Sopenharmony_ci vm_flags |= VM_NORESERVE; 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci if (!hugetlb_reserve_pages(inode, 17962306a36Sopenharmony_ci vma->vm_pgoff >> huge_page_order(h), 18062306a36Sopenharmony_ci len >> huge_page_shift(h), vma, 18162306a36Sopenharmony_ci vm_flags)) 18262306a36Sopenharmony_ci goto out; 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_ci ret = 0; 18562306a36Sopenharmony_ci if (vma->vm_flags & VM_WRITE && inode->i_size < len) 18662306a36Sopenharmony_ci i_size_write(inode, len); 18762306a36Sopenharmony_ciout: 18862306a36Sopenharmony_ci inode_unlock(inode); 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci return ret; 19162306a36Sopenharmony_ci} 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci/* 19462306a36Sopenharmony_ci * Called under mmap_write_lock(mm). 19562306a36Sopenharmony_ci */ 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_cistatic unsigned long 19862306a36Sopenharmony_cihugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, 19962306a36Sopenharmony_ci unsigned long len, unsigned long pgoff, unsigned long flags) 20062306a36Sopenharmony_ci{ 20162306a36Sopenharmony_ci struct hstate *h = hstate_file(file); 20262306a36Sopenharmony_ci struct vm_unmapped_area_info info; 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci info.flags = 0; 20562306a36Sopenharmony_ci info.length = len; 20662306a36Sopenharmony_ci info.low_limit = current->mm->mmap_base; 20762306a36Sopenharmony_ci info.high_limit = arch_get_mmap_end(addr, len, flags); 20862306a36Sopenharmony_ci info.align_mask = PAGE_MASK & ~huge_page_mask(h); 20962306a36Sopenharmony_ci info.align_offset = 0; 21062306a36Sopenharmony_ci return vm_unmapped_area(&info); 21162306a36Sopenharmony_ci} 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_cistatic unsigned long 21462306a36Sopenharmony_cihugetlb_get_unmapped_area_topdown(struct file *file, 
unsigned long addr, 21562306a36Sopenharmony_ci unsigned long len, unsigned long pgoff, unsigned long flags) 21662306a36Sopenharmony_ci{ 21762306a36Sopenharmony_ci struct hstate *h = hstate_file(file); 21862306a36Sopenharmony_ci struct vm_unmapped_area_info info; 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci info.flags = VM_UNMAPPED_AREA_TOPDOWN; 22162306a36Sopenharmony_ci info.length = len; 22262306a36Sopenharmony_ci info.low_limit = PAGE_SIZE; 22362306a36Sopenharmony_ci info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); 22462306a36Sopenharmony_ci info.align_mask = PAGE_MASK & ~huge_page_mask(h); 22562306a36Sopenharmony_ci info.align_offset = 0; 22662306a36Sopenharmony_ci addr = vm_unmapped_area(&info); 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_ci /* 22962306a36Sopenharmony_ci * A failed mmap() very likely causes application failure, 23062306a36Sopenharmony_ci * so fall back to the bottom-up function here. This scenario 23162306a36Sopenharmony_ci * can happen with large stack limits and large mmap() 23262306a36Sopenharmony_ci * allocations. 
23362306a36Sopenharmony_ci */ 23462306a36Sopenharmony_ci if (unlikely(offset_in_page(addr))) { 23562306a36Sopenharmony_ci VM_BUG_ON(addr != -ENOMEM); 23662306a36Sopenharmony_ci info.flags = 0; 23762306a36Sopenharmony_ci info.low_limit = current->mm->mmap_base; 23862306a36Sopenharmony_ci info.high_limit = arch_get_mmap_end(addr, len, flags); 23962306a36Sopenharmony_ci addr = vm_unmapped_area(&info); 24062306a36Sopenharmony_ci } 24162306a36Sopenharmony_ci 24262306a36Sopenharmony_ci return addr; 24362306a36Sopenharmony_ci} 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ciunsigned long 24662306a36Sopenharmony_cigeneric_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 24762306a36Sopenharmony_ci unsigned long len, unsigned long pgoff, 24862306a36Sopenharmony_ci unsigned long flags) 24962306a36Sopenharmony_ci{ 25062306a36Sopenharmony_ci struct mm_struct *mm = current->mm; 25162306a36Sopenharmony_ci struct vm_area_struct *vma; 25262306a36Sopenharmony_ci struct hstate *h = hstate_file(file); 25362306a36Sopenharmony_ci const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); 25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci if (len & ~huge_page_mask(h)) 25662306a36Sopenharmony_ci return -EINVAL; 25762306a36Sopenharmony_ci if (len > TASK_SIZE) 25862306a36Sopenharmony_ci return -ENOMEM; 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_ci if (flags & MAP_FIXED) { 26162306a36Sopenharmony_ci if (prepare_hugepage_range(file, addr, len)) 26262306a36Sopenharmony_ci return -EINVAL; 26362306a36Sopenharmony_ci return addr; 26462306a36Sopenharmony_ci } 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_ci if (addr) { 26762306a36Sopenharmony_ci addr = ALIGN(addr, huge_page_size(h)); 26862306a36Sopenharmony_ci vma = find_vma(mm, addr); 26962306a36Sopenharmony_ci if (mmap_end - len >= addr && 27062306a36Sopenharmony_ci (!vma || addr + len <= vm_start_gap(vma))) 27162306a36Sopenharmony_ci return addr; 27262306a36Sopenharmony_ci } 
27362306a36Sopenharmony_ci 27462306a36Sopenharmony_ci /* 27562306a36Sopenharmony_ci * Use mm->get_unmapped_area value as a hint to use topdown routine. 27662306a36Sopenharmony_ci * If architectures have special needs, they should define their own 27762306a36Sopenharmony_ci * version of hugetlb_get_unmapped_area. 27862306a36Sopenharmony_ci */ 27962306a36Sopenharmony_ci if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) 28062306a36Sopenharmony_ci return hugetlb_get_unmapped_area_topdown(file, addr, len, 28162306a36Sopenharmony_ci pgoff, flags); 28262306a36Sopenharmony_ci return hugetlb_get_unmapped_area_bottomup(file, addr, len, 28362306a36Sopenharmony_ci pgoff, flags); 28462306a36Sopenharmony_ci} 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_ci#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 28762306a36Sopenharmony_cistatic unsigned long 28862306a36Sopenharmony_cihugetlb_get_unmapped_area(struct file *file, unsigned long addr, 28962306a36Sopenharmony_ci unsigned long len, unsigned long pgoff, 29062306a36Sopenharmony_ci unsigned long flags) 29162306a36Sopenharmony_ci{ 29262306a36Sopenharmony_ci return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); 29362306a36Sopenharmony_ci} 29462306a36Sopenharmony_ci#endif 29562306a36Sopenharmony_ci 29662306a36Sopenharmony_ci/* 29762306a36Sopenharmony_ci * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. 29862306a36Sopenharmony_ci * Returns the maximum number of bytes one can read without touching the 1st raw 29962306a36Sopenharmony_ci * HWPOISON subpage. 30062306a36Sopenharmony_ci * 30162306a36Sopenharmony_ci * The implementation borrows the iteration logic from copy_page_to_iter*. 
30262306a36Sopenharmony_ci */ 30362306a36Sopenharmony_cistatic size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) 30462306a36Sopenharmony_ci{ 30562306a36Sopenharmony_ci size_t n = 0; 30662306a36Sopenharmony_ci size_t res = 0; 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_ci /* First subpage to start the loop. */ 30962306a36Sopenharmony_ci page = nth_page(page, offset / PAGE_SIZE); 31062306a36Sopenharmony_ci offset %= PAGE_SIZE; 31162306a36Sopenharmony_ci while (1) { 31262306a36Sopenharmony_ci if (is_raw_hwpoison_page_in_hugepage(page)) 31362306a36Sopenharmony_ci break; 31462306a36Sopenharmony_ci 31562306a36Sopenharmony_ci /* Safe to read n bytes without touching HWPOISON subpage. */ 31662306a36Sopenharmony_ci n = min(bytes, (size_t)PAGE_SIZE - offset); 31762306a36Sopenharmony_ci res += n; 31862306a36Sopenharmony_ci bytes -= n; 31962306a36Sopenharmony_ci if (!bytes || !n) 32062306a36Sopenharmony_ci break; 32162306a36Sopenharmony_ci offset += n; 32262306a36Sopenharmony_ci if (offset == PAGE_SIZE) { 32362306a36Sopenharmony_ci page = nth_page(page, 1); 32462306a36Sopenharmony_ci offset = 0; 32562306a36Sopenharmony_ci } 32662306a36Sopenharmony_ci } 32762306a36Sopenharmony_ci 32862306a36Sopenharmony_ci return res; 32962306a36Sopenharmony_ci} 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ci/* 33262306a36Sopenharmony_ci * Support for read() - Find the page attached to f_mapping and copy out the 33362306a36Sopenharmony_ci * data. This provides functionality similar to filemap_read(). 
33462306a36Sopenharmony_ci */ 33562306a36Sopenharmony_cistatic ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) 33662306a36Sopenharmony_ci{ 33762306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 33862306a36Sopenharmony_ci struct hstate *h = hstate_file(file); 33962306a36Sopenharmony_ci struct address_space *mapping = file->f_mapping; 34062306a36Sopenharmony_ci struct inode *inode = mapping->host; 34162306a36Sopenharmony_ci unsigned long index = iocb->ki_pos >> huge_page_shift(h); 34262306a36Sopenharmony_ci unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); 34362306a36Sopenharmony_ci unsigned long end_index; 34462306a36Sopenharmony_ci loff_t isize; 34562306a36Sopenharmony_ci ssize_t retval = 0; 34662306a36Sopenharmony_ci 34762306a36Sopenharmony_ci while (iov_iter_count(to)) { 34862306a36Sopenharmony_ci struct page *page; 34962306a36Sopenharmony_ci size_t nr, copied, want; 35062306a36Sopenharmony_ci 35162306a36Sopenharmony_ci /* nr is the maximum number of bytes to copy from this page */ 35262306a36Sopenharmony_ci nr = huge_page_size(h); 35362306a36Sopenharmony_ci isize = i_size_read(inode); 35462306a36Sopenharmony_ci if (!isize) 35562306a36Sopenharmony_ci break; 35662306a36Sopenharmony_ci end_index = (isize - 1) >> huge_page_shift(h); 35762306a36Sopenharmony_ci if (index > end_index) 35862306a36Sopenharmony_ci break; 35962306a36Sopenharmony_ci if (index == end_index) { 36062306a36Sopenharmony_ci nr = ((isize - 1) & ~huge_page_mask(h)) + 1; 36162306a36Sopenharmony_ci if (nr <= offset) 36262306a36Sopenharmony_ci break; 36362306a36Sopenharmony_ci } 36462306a36Sopenharmony_ci nr = nr - offset; 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_ci /* Find the page */ 36762306a36Sopenharmony_ci page = find_lock_page(mapping, index); 36862306a36Sopenharmony_ci if (unlikely(page == NULL)) { 36962306a36Sopenharmony_ci /* 37062306a36Sopenharmony_ci * We have a HOLE, zero out the user-buffer for the 37162306a36Sopenharmony_ci * length of the 
hole or request. 37262306a36Sopenharmony_ci */ 37362306a36Sopenharmony_ci copied = iov_iter_zero(nr, to); 37462306a36Sopenharmony_ci } else { 37562306a36Sopenharmony_ci unlock_page(page); 37662306a36Sopenharmony_ci 37762306a36Sopenharmony_ci if (!PageHWPoison(page)) 37862306a36Sopenharmony_ci want = nr; 37962306a36Sopenharmony_ci else { 38062306a36Sopenharmony_ci /* 38162306a36Sopenharmony_ci * Adjust how many bytes safe to read without 38262306a36Sopenharmony_ci * touching the 1st raw HWPOISON subpage after 38362306a36Sopenharmony_ci * offset. 38462306a36Sopenharmony_ci */ 38562306a36Sopenharmony_ci want = adjust_range_hwpoison(page, offset, nr); 38662306a36Sopenharmony_ci if (want == 0) { 38762306a36Sopenharmony_ci put_page(page); 38862306a36Sopenharmony_ci retval = -EIO; 38962306a36Sopenharmony_ci break; 39062306a36Sopenharmony_ci } 39162306a36Sopenharmony_ci } 39262306a36Sopenharmony_ci 39362306a36Sopenharmony_ci /* 39462306a36Sopenharmony_ci * We have the page, copy it to user space buffer. 
39562306a36Sopenharmony_ci */ 39662306a36Sopenharmony_ci copied = copy_page_to_iter(page, offset, want, to); 39762306a36Sopenharmony_ci put_page(page); 39862306a36Sopenharmony_ci } 39962306a36Sopenharmony_ci offset += copied; 40062306a36Sopenharmony_ci retval += copied; 40162306a36Sopenharmony_ci if (copied != nr && iov_iter_count(to)) { 40262306a36Sopenharmony_ci if (!retval) 40362306a36Sopenharmony_ci retval = -EFAULT; 40462306a36Sopenharmony_ci break; 40562306a36Sopenharmony_ci } 40662306a36Sopenharmony_ci index += offset >> huge_page_shift(h); 40762306a36Sopenharmony_ci offset &= ~huge_page_mask(h); 40862306a36Sopenharmony_ci } 40962306a36Sopenharmony_ci iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; 41062306a36Sopenharmony_ci return retval; 41162306a36Sopenharmony_ci} 41262306a36Sopenharmony_ci 41362306a36Sopenharmony_cistatic int hugetlbfs_write_begin(struct file *file, 41462306a36Sopenharmony_ci struct address_space *mapping, 41562306a36Sopenharmony_ci loff_t pos, unsigned len, 41662306a36Sopenharmony_ci struct page **pagep, void **fsdata) 41762306a36Sopenharmony_ci{ 41862306a36Sopenharmony_ci return -EINVAL; 41962306a36Sopenharmony_ci} 42062306a36Sopenharmony_ci 42162306a36Sopenharmony_cistatic int hugetlbfs_write_end(struct file *file, struct address_space *mapping, 42262306a36Sopenharmony_ci loff_t pos, unsigned len, unsigned copied, 42362306a36Sopenharmony_ci struct page *page, void *fsdata) 42462306a36Sopenharmony_ci{ 42562306a36Sopenharmony_ci BUG(); 42662306a36Sopenharmony_ci return -EINVAL; 42762306a36Sopenharmony_ci} 42862306a36Sopenharmony_ci 42962306a36Sopenharmony_cistatic void hugetlb_delete_from_page_cache(struct folio *folio) 43062306a36Sopenharmony_ci{ 43162306a36Sopenharmony_ci folio_clear_dirty(folio); 43262306a36Sopenharmony_ci folio_clear_uptodate(folio); 43362306a36Sopenharmony_ci filemap_remove_folio(folio); 43462306a36Sopenharmony_ci} 43562306a36Sopenharmony_ci 43662306a36Sopenharmony_ci/* 43762306a36Sopenharmony_ci * 
Called with i_mmap_rwsem held for inode based vma maps. This makes 43862306a36Sopenharmony_ci * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault 43962306a36Sopenharmony_ci * mutex for the page in the mapping. So, we can not race with page being 44062306a36Sopenharmony_ci * faulted into the vma. 44162306a36Sopenharmony_ci */ 44262306a36Sopenharmony_cistatic bool hugetlb_vma_maps_page(struct vm_area_struct *vma, 44362306a36Sopenharmony_ci unsigned long addr, struct page *page) 44462306a36Sopenharmony_ci{ 44562306a36Sopenharmony_ci pte_t *ptep, pte; 44662306a36Sopenharmony_ci 44762306a36Sopenharmony_ci ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); 44862306a36Sopenharmony_ci if (!ptep) 44962306a36Sopenharmony_ci return false; 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci pte = huge_ptep_get(ptep); 45262306a36Sopenharmony_ci if (huge_pte_none(pte) || !pte_present(pte)) 45362306a36Sopenharmony_ci return false; 45462306a36Sopenharmony_ci 45562306a36Sopenharmony_ci if (pte_page(pte) == page) 45662306a36Sopenharmony_ci return true; 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci return false; 45962306a36Sopenharmony_ci} 46062306a36Sopenharmony_ci 46162306a36Sopenharmony_ci/* 46262306a36Sopenharmony_ci * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? 46362306a36Sopenharmony_ci * No, because the interval tree returns us only those vmas 46462306a36Sopenharmony_ci * which overlap the truncated area starting at pgoff, 46562306a36Sopenharmony_ci * and no vma on a 32-bit arch can span beyond the 4GB. 
46662306a36Sopenharmony_ci */ 46762306a36Sopenharmony_cistatic unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) 46862306a36Sopenharmony_ci{ 46962306a36Sopenharmony_ci unsigned long offset = 0; 47062306a36Sopenharmony_ci 47162306a36Sopenharmony_ci if (vma->vm_pgoff < start) 47262306a36Sopenharmony_ci offset = (start - vma->vm_pgoff) << PAGE_SHIFT; 47362306a36Sopenharmony_ci 47462306a36Sopenharmony_ci return vma->vm_start + offset; 47562306a36Sopenharmony_ci} 47662306a36Sopenharmony_ci 47762306a36Sopenharmony_cistatic unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) 47862306a36Sopenharmony_ci{ 47962306a36Sopenharmony_ci unsigned long t_end; 48062306a36Sopenharmony_ci 48162306a36Sopenharmony_ci if (!end) 48262306a36Sopenharmony_ci return vma->vm_end; 48362306a36Sopenharmony_ci 48462306a36Sopenharmony_ci t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; 48562306a36Sopenharmony_ci if (t_end > vma->vm_end) 48662306a36Sopenharmony_ci t_end = vma->vm_end; 48762306a36Sopenharmony_ci return t_end; 48862306a36Sopenharmony_ci} 48962306a36Sopenharmony_ci 49062306a36Sopenharmony_ci/* 49162306a36Sopenharmony_ci * Called with hugetlb fault mutex held. Therefore, no more mappings to 49262306a36Sopenharmony_ci * this folio can be created while executing the routine. 
49362306a36Sopenharmony_ci */ 49462306a36Sopenharmony_cistatic void hugetlb_unmap_file_folio(struct hstate *h, 49562306a36Sopenharmony_ci struct address_space *mapping, 49662306a36Sopenharmony_ci struct folio *folio, pgoff_t index) 49762306a36Sopenharmony_ci{ 49862306a36Sopenharmony_ci struct rb_root_cached *root = &mapping->i_mmap; 49962306a36Sopenharmony_ci struct hugetlb_vma_lock *vma_lock; 50062306a36Sopenharmony_ci struct page *page = &folio->page; 50162306a36Sopenharmony_ci struct vm_area_struct *vma; 50262306a36Sopenharmony_ci unsigned long v_start; 50362306a36Sopenharmony_ci unsigned long v_end; 50462306a36Sopenharmony_ci pgoff_t start, end; 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci start = index * pages_per_huge_page(h); 50762306a36Sopenharmony_ci end = (index + 1) * pages_per_huge_page(h); 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_ci i_mmap_lock_write(mapping); 51062306a36Sopenharmony_ciretry: 51162306a36Sopenharmony_ci vma_lock = NULL; 51262306a36Sopenharmony_ci vma_interval_tree_foreach(vma, root, start, end - 1) { 51362306a36Sopenharmony_ci v_start = vma_offset_start(vma, start); 51462306a36Sopenharmony_ci v_end = vma_offset_end(vma, end); 51562306a36Sopenharmony_ci 51662306a36Sopenharmony_ci if (!hugetlb_vma_maps_page(vma, v_start, page)) 51762306a36Sopenharmony_ci continue; 51862306a36Sopenharmony_ci 51962306a36Sopenharmony_ci if (!hugetlb_vma_trylock_write(vma)) { 52062306a36Sopenharmony_ci vma_lock = vma->vm_private_data; 52162306a36Sopenharmony_ci /* 52262306a36Sopenharmony_ci * If we can not get vma lock, we need to drop 52362306a36Sopenharmony_ci * immap_sema and take locks in order. First, 52462306a36Sopenharmony_ci * take a ref on the vma_lock structure so that 52562306a36Sopenharmony_ci * we can be guaranteed it will not go away when 52662306a36Sopenharmony_ci * dropping immap_sema. 
52762306a36Sopenharmony_ci */ 52862306a36Sopenharmony_ci kref_get(&vma_lock->refs); 52962306a36Sopenharmony_ci break; 53062306a36Sopenharmony_ci } 53162306a36Sopenharmony_ci 53262306a36Sopenharmony_ci unmap_hugepage_range(vma, v_start, v_end, NULL, 53362306a36Sopenharmony_ci ZAP_FLAG_DROP_MARKER); 53462306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 53562306a36Sopenharmony_ci } 53662306a36Sopenharmony_ci 53762306a36Sopenharmony_ci i_mmap_unlock_write(mapping); 53862306a36Sopenharmony_ci 53962306a36Sopenharmony_ci if (vma_lock) { 54062306a36Sopenharmony_ci /* 54162306a36Sopenharmony_ci * Wait on vma_lock. We know it is still valid as we have 54262306a36Sopenharmony_ci * a reference. We must 'open code' vma locking as we do 54362306a36Sopenharmony_ci * not know if vma_lock is still attached to vma. 54462306a36Sopenharmony_ci */ 54562306a36Sopenharmony_ci down_write(&vma_lock->rw_sema); 54662306a36Sopenharmony_ci i_mmap_lock_write(mapping); 54762306a36Sopenharmony_ci 54862306a36Sopenharmony_ci vma = vma_lock->vma; 54962306a36Sopenharmony_ci if (!vma) { 55062306a36Sopenharmony_ci /* 55162306a36Sopenharmony_ci * If lock is no longer attached to vma, then just 55262306a36Sopenharmony_ci * unlock, drop our reference and retry looking for 55362306a36Sopenharmony_ci * other vmas. 55462306a36Sopenharmony_ci */ 55562306a36Sopenharmony_ci up_write(&vma_lock->rw_sema); 55662306a36Sopenharmony_ci kref_put(&vma_lock->refs, hugetlb_vma_lock_release); 55762306a36Sopenharmony_ci goto retry; 55862306a36Sopenharmony_ci } 55962306a36Sopenharmony_ci 56062306a36Sopenharmony_ci /* 56162306a36Sopenharmony_ci * vma_lock is still attached to vma. Check to see if vma 56262306a36Sopenharmony_ci * still maps page and if so, unmap. 
56362306a36Sopenharmony_ci */ 56462306a36Sopenharmony_ci v_start = vma_offset_start(vma, start); 56562306a36Sopenharmony_ci v_end = vma_offset_end(vma, end); 56662306a36Sopenharmony_ci if (hugetlb_vma_maps_page(vma, v_start, page)) 56762306a36Sopenharmony_ci unmap_hugepage_range(vma, v_start, v_end, NULL, 56862306a36Sopenharmony_ci ZAP_FLAG_DROP_MARKER); 56962306a36Sopenharmony_ci 57062306a36Sopenharmony_ci kref_put(&vma_lock->refs, hugetlb_vma_lock_release); 57162306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 57262306a36Sopenharmony_ci 57362306a36Sopenharmony_ci goto retry; 57462306a36Sopenharmony_ci } 57562306a36Sopenharmony_ci} 57662306a36Sopenharmony_ci

/*
 * Unmap the file range [start, end) from every vma in @root that overlaps it.
 *
 * @start and @end are file offsets in PAGE_SIZE units (the callers in this
 * file pass "offset >> PAGE_SHIFT").  end == 0 requests everything from
 * @start onwards.  Each vma is write-locked with a trylock; a vma whose lock
 * cannot be taken is skipped, not waited for.  @zap_flags is passed unchanged
 * to unmap_hugepage_range().
 */
static void
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
		      zap_flags_t zap_flags)
{
	struct vm_area_struct *vma;

	/*
	 * end == 0 indicates that the entire range after start should be
	 * unmapped.  Note, end is exclusive, whereas the interval tree takes
	 * an inclusive "last".
	 */
	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
		unsigned long v_start;
		unsigned long v_end;

		if (!hugetlb_vma_trylock_write(vma))
			continue;

		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);

		unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);

		/*
		 * Note that vma lock only exists for shared/non-private
		 * vmas.  Therefore, lock is not held when calling
		 * unmap_hugepage_range for private vmas.
		 */
		hugetlb_vma_unlock_write(vma);
	}
}

/*
 * Remove one folio from the page cache of @mapping.
 *
 * Called with hugetlb fault mutex held.
 *
 * @index is the folio's page cache index.  When @truncate_op is false (hole
 * punch) the reservation for [index, index + 1) is dropped here as well; for
 * truncation the caller does the reserve cleanup for the whole range
 * afterwards.
 *
 * Returns true if page was actually removed, false otherwise.  As written,
 * the folio is always removed, so the result is always true.
 */
static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
					struct address_space *mapping,
					struct folio *folio, pgoff_t index,
					bool truncate_op)
{
	bool ret = false;

	/*
	 * If folio is mapped, it was faulted in after being
	 * unmapped in caller.  Unmap (again) while holding
	 * the fault mutex.  The mutex will prevent faults
	 * until we finish removing the folio.
	 */
	if (unlikely(folio_mapped(folio)))
		hugetlb_unmap_file_folio(h, mapping, folio, index);

	folio_lock(folio);
	/*
	 * We must remove the folio from page cache before removing
	 * the region/reserve map (hugetlb_unreserve_pages).  In
	 * rare out of memory conditions, removal of the region/reserve
	 * map could fail.  Correspondingly, the subpool and global
	 * reserve usage count may need to be adjusted.
	 */
	VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
	hugetlb_delete_from_page_cache(folio);
	ret = true;
	if (!truncate_op) {
		if (unlikely(hugetlb_unreserve_pages(inode, index,
							index + 1, 1)))
			hugetlb_fix_reserve_counts(inode);
	}

	folio_unlock(folio);
	return ret;
}

/*
 * remove_inode_hugepages handles two distinct cases: truncation and hole
 * punch.  There are subtle differences in operation for each case.
 *
 * truncation is indicated by end of range being LLONG_MAX
 *	In this case, we first scan the range and release found pages.
 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
 *	maps and global counts.  Page faults can race with truncation.
 *	During faults, hugetlb_no_page() checks i_size before page allocation,
 *	and again after obtaining page table lock.  It will 'back out'
 *	allocations in the truncated range.
 * hole punch is indicated if end is not LLONG_MAX
 *	In the hole punch case we scan the range and release found pages.
 *	Only when releasing a page is the associated region/reserve map
 *	deleted.  The region/reserve map for ranges without associated
 *	pages are not modified.  Page faults can race with hole punch.
 *	This is indicated if we find a mapped page.
 * Note: If the passed end of range value is beyond the end of file, but
 * not LLONG_MAX this routine still performs a hole punch operation.
 *
 * @lstart and @lend are byte offsets; they are converted to huge page
 * indices by shifting right by huge_page_shift().  Each folio is removed
 * while holding the fault mutex selected by hugetlb_fault_mutex_hash() for
 * its index.
 */
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
				   loff_t lend)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t start = lstart >> huge_page_shift(h);
	const pgoff_t end = lend >> huge_page_shift(h);
	struct folio_batch fbatch;
	pgoff_t next, index;
	int i, freed = 0;
	bool truncate_op = (lend == LLONG_MAX);

	folio_batch_init(&fbatch);
	next = start;
	/* "end" is exclusive; filemap_get_folios() is given "end - 1". */
	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
			struct folio *folio = fbatch.folios[i];
			u32 hash = 0;

			index = folio->index;
			hash = hugetlb_fault_mutex_hash(mapping, index);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);

			/*
			 * Remove folio that was part of folio_batch.
			 */
			if (remove_inode_single_folio(h, inode, mapping, folio,
							index, truncate_op))
				freed++;

			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	/* Truncation: one reserve cleanup for the whole tail of the file. */
	if (truncate_op)
		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}

/*
 * Inode eviction: drop every huge page of the inode, release the reserve map
 * stored in i_data.private_data (if any), then clear_inode().
 */
static void hugetlbfs_evict_inode(struct inode *inode)
{
	struct resv_map *resv_map;

	remove_inode_hugepages(inode, 0, LLONG_MAX);

	/*
	 * Get the resv_map from the address space embedded in the inode.
	 * This is the address space which points to any resv_map allocated
	 * at inode creation time.  If this is a device special inode,
	 * i_mapping may not point to the original address space.
	 */
	resv_map = (struct resv_map *)(&inode->i_data)->private_data;
	/* Only regular and link inodes have associated reserve maps */
	if (resv_map)
		resv_map_release(&resv_map->refs);
	clear_inode(inode);
}

/*
 * Truncate the file to @offset bytes.
 *
 * @offset must be huge page aligned (BUG_ON otherwise).  The new size is
 * written first, then everything from @offset to the end of the file is
 * unmapped under the i_mmap write lock, and finally the pages beyond
 * @offset are removed.
 */
static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	pgoff_t pgoff;
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
				      ZAP_FLAG_DROP_MARKER);
	i_mmap_unlock_write(mapping);
	remove_inode_hugepages(inode, offset, LLONG_MAX);
}

/*
 * Zero the byte range [start, end) inside the single huge page that
 * contains @start.
 *
 * @start and @end are file byte offsets; both are reduced to offsets within
 * the huge page.  An @end that falls exactly on a huge page boundary reduces
 * to 0 and is taken to mean "up to the end of the page".  Nothing is done
 * if no folio is present in the page cache at that index.
 */
static void hugetlbfs_zero_partial_page(struct hstate *h,
					struct address_space *mapping,
					loff_t start,
					loff_t end)
{
	pgoff_t idx = start >> huge_page_shift(h);
	struct folio *folio;

	folio = filemap_lock_folio(mapping, idx);
	if (IS_ERR(folio))
		return;

	start = start & ~huge_page_mask(h);
	end = end & ~huge_page_mask(h);
	if (!end)
		end = huge_page_size(h);

	folio_zero_segment(folio, (size_t)start, (size_t)end);

	folio_unlock(folio);
	folio_put(folio);
}

/*
 * FALLOC_FL_PUNCH_HOLE implementation for the byte range
 * [offset, offset + len).
 *
 * Huge pages that lie completely inside the range are unmapped and removed
 * from the file; a partially covered page at either end is only zeroed.
 *
 * Returns 0 on success, or -EPERM if the inode is sealed with F_SEAL_WRITE
 * or F_SEAL_FUTURE_WRITE.
 */
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	loff_t hpage_size = huge_page_size(h);
	loff_t hole_start, hole_end;

	/*
	 * hole_start and hole_end indicate the full pages within the hole.
	 */
	hole_start = round_up(offset, hpage_size);
	hole_end = round_down(offset + len, hpage_size);

	inode_lock(inode);

	/* protected by i_rwsem */
	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
		inode_unlock(inode);
		return -EPERM;
	}

	i_mmap_lock_write(mapping);

	/* If range starts before first full page, zero partial page. */
	if (offset < hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				offset, min(offset + len, hole_start));

	/* Unmap users of full pages in the hole. */
	if (hole_end > hole_start) {
		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
			hugetlb_vmdelete_list(&mapping->i_mmap,
					      hole_start >> PAGE_SHIFT,
					      hole_end >> PAGE_SHIFT, 0);
	}

	/* If range extends beyond last full page, zero partial page. */
	if ((offset + len) > hole_end && (offset + len) > hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				hole_end, offset + len);

	i_mmap_unlock_write(mapping);

	/* Remove full pages from the file. */
	if (hole_end > hole_start)
		remove_inode_hugepages(inode, hole_start, hole_end);

	inode_unlock(inode);

	return 0;
}

/*
 * fallocate() for hugetlbfs files.
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted in @mode;
 * any other bit yields -EOPNOTSUPP.  Hole punching is delegated to
 * hugetlbfs_punch_hole().  Otherwise every huge page overlapping
 * [offset, offset + len) that is not yet in the page cache is allocated,
 * cleared and added to it.
 *
 * Returns 0 on success or a negative errno: the result of
 * inode_newsize_ok(), -EPERM when F_SEAL_GROW forbids growing the file,
 * -EINTR when a signal is pending, or the error from folio allocation or
 * page cache insertion.
 *
 * Note: -EINTR leaves the loop with "break", so the i_size and ctime
 * updates below still run; allocation and page cache errors jump to "out"
 * and skip them.
 */
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return hugetlbfs_punch_hole(inode, offset, len);

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	inode_lock(inode);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.  If NUMA is configured, use page index
	 * as input to create an allocation policy.
	 */
	vma_init(&pseudo_vma, mm);
	vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		struct folio *folio;
		unsigned long addr;

		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(mapping, index);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		folio = filemap_get_folio(mapping, index);
		if (!IS_ERR(folio)) {
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			continue;
		}

		/*
		 * Allocate folio without setting the avoid_reserve argument.
		 * There certainly are no reserves associated with the
		 * pseudo_vma.  However, there could be shared mappings with
		 * reserves for the file at the inode level.  If we fallocate
		 * folios in these areas, we need to consume the reserves
		 * to keep reservation accounting consistent.
		 */
		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
		folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
		hugetlb_drop_vma_policy(&pseudo_vma);
		if (IS_ERR(folio)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(folio);
			goto out;
		}
		clear_huge_page(&folio->page, addr, pages_per_huge_page(h));
		__folio_mark_uptodate(folio);
		error = hugetlb_add_to_page_cache(folio, mapping, index);
		if (unlikely(error)) {
			restore_reserve_on_error(h, &pseudo_vma, addr, folio);
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		folio_set_hugetlb_migratable(folio);
		/*
		 * folio_unlock because locked by hugetlb_add_to_page_cache()
		 * folio_put() due to reference from alloc_hugetlb_folio()
		 */
		folio_unlock(folio);
		folio_put(folio);
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode_set_ctime_current(inode);
out:
	inode_unlock(inode);
	return error;
}
95162306a36Sopenharmony_ci 95262306a36Sopenharmony_cistatic int hugetlbfs_setattr(struct mnt_idmap *idmap, 95362306a36Sopenharmony_ci struct dentry *dentry, struct iattr *attr) 95462306a36Sopenharmony_ci{ 95562306a36Sopenharmony_ci struct inode *inode = d_inode(dentry); 95662306a36Sopenharmony_ci struct hstate *h = hstate_inode(inode); 95762306a36Sopenharmony_ci int error; 95862306a36Sopenharmony_ci unsigned int ia_valid = attr->ia_valid; 95962306a36Sopenharmony_ci struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); 96062306a36Sopenharmony_ci 96162306a36Sopenharmony_ci error = setattr_prepare(&nop_mnt_idmap, dentry, attr); 96262306a36Sopenharmony_ci if (error) 96362306a36Sopenharmony_ci return error; 96462306a36Sopenharmony_ci 96562306a36Sopenharmony_ci if (ia_valid & ATTR_SIZE) { 96662306a36Sopenharmony_ci loff_t oldsize = inode->i_size; 96762306a36Sopenharmony_ci loff_t newsize = attr->ia_size; 96862306a36Sopenharmony_ci 96962306a36Sopenharmony_ci if (newsize & ~huge_page_mask(h)) 97062306a36Sopenharmony_ci return -EINVAL; 97162306a36Sopenharmony_ci /* protected by i_rwsem */ 97262306a36Sopenharmony_ci if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 97362306a36Sopenharmony_ci (newsize > oldsize && (info->seals & F_SEAL_GROW))) 97462306a36Sopenharmony_ci return -EPERM; 97562306a36Sopenharmony_ci hugetlb_vmtruncate(inode, newsize); 97662306a36Sopenharmony_ci } 97762306a36Sopenharmony_ci 97862306a36Sopenharmony_ci setattr_copy(&nop_mnt_idmap, inode, attr); 97962306a36Sopenharmony_ci mark_inode_dirty(inode); 98062306a36Sopenharmony_ci return 0; 98162306a36Sopenharmony_ci} 98262306a36Sopenharmony_ci 98362306a36Sopenharmony_cistatic struct inode *hugetlbfs_get_root(struct super_block *sb, 98462306a36Sopenharmony_ci struct hugetlbfs_fs_context *ctx) 98562306a36Sopenharmony_ci{ 98662306a36Sopenharmony_ci struct inode *inode; 98762306a36Sopenharmony_ci 98862306a36Sopenharmony_ci inode = new_inode(sb); 98962306a36Sopenharmony_ci if (inode) { 
99062306a36Sopenharmony_ci inode->i_ino = get_next_ino(); 99162306a36Sopenharmony_ci inode->i_mode = S_IFDIR | ctx->mode; 99262306a36Sopenharmony_ci inode->i_uid = ctx->uid; 99362306a36Sopenharmony_ci inode->i_gid = ctx->gid; 99462306a36Sopenharmony_ci inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); 99562306a36Sopenharmony_ci inode->i_op = &hugetlbfs_dir_inode_operations; 99662306a36Sopenharmony_ci inode->i_fop = &simple_dir_operations; 99762306a36Sopenharmony_ci /* directory inodes start off with i_nlink == 2 (for "." entry) */ 99862306a36Sopenharmony_ci inc_nlink(inode); 99962306a36Sopenharmony_ci lockdep_annotate_inode_mutex_key(inode); 100062306a36Sopenharmony_ci } 100162306a36Sopenharmony_ci return inode; 100262306a36Sopenharmony_ci} 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci/* 100562306a36Sopenharmony_ci * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never 100662306a36Sopenharmony_ci * be taken from reclaim -- unlike regular filesystems. This needs an 100762306a36Sopenharmony_ci * annotation because huge_pmd_share() does an allocation under hugetlb's 100862306a36Sopenharmony_ci * i_mmap_rwsem. 100962306a36Sopenharmony_ci */ 101062306a36Sopenharmony_cistatic struct lock_class_key hugetlbfs_i_mmap_rwsem_key; 101162306a36Sopenharmony_ci 101262306a36Sopenharmony_cistatic struct inode *hugetlbfs_get_inode(struct super_block *sb, 101362306a36Sopenharmony_ci struct inode *dir, 101462306a36Sopenharmony_ci umode_t mode, dev_t dev) 101562306a36Sopenharmony_ci{ 101662306a36Sopenharmony_ci struct inode *inode; 101762306a36Sopenharmony_ci struct resv_map *resv_map = NULL; 101862306a36Sopenharmony_ci 101962306a36Sopenharmony_ci /* 102062306a36Sopenharmony_ci * Reserve maps are only needed for inodes that can have associated 102162306a36Sopenharmony_ci * page allocations. 
102262306a36Sopenharmony_ci */ 102362306a36Sopenharmony_ci if (S_ISREG(mode) || S_ISLNK(mode)) { 102462306a36Sopenharmony_ci resv_map = resv_map_alloc(); 102562306a36Sopenharmony_ci if (!resv_map) 102662306a36Sopenharmony_ci return NULL; 102762306a36Sopenharmony_ci } 102862306a36Sopenharmony_ci 102962306a36Sopenharmony_ci inode = new_inode(sb); 103062306a36Sopenharmony_ci if (inode) { 103162306a36Sopenharmony_ci struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); 103262306a36Sopenharmony_ci 103362306a36Sopenharmony_ci inode->i_ino = get_next_ino(); 103462306a36Sopenharmony_ci inode_init_owner(&nop_mnt_idmap, inode, dir, mode); 103562306a36Sopenharmony_ci lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, 103662306a36Sopenharmony_ci &hugetlbfs_i_mmap_rwsem_key); 103762306a36Sopenharmony_ci inode->i_mapping->a_ops = &hugetlbfs_aops; 103862306a36Sopenharmony_ci inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); 103962306a36Sopenharmony_ci inode->i_mapping->private_data = resv_map; 104062306a36Sopenharmony_ci info->seals = F_SEAL_SEAL; 104162306a36Sopenharmony_ci switch (mode & S_IFMT) { 104262306a36Sopenharmony_ci default: 104362306a36Sopenharmony_ci init_special_inode(inode, mode, dev); 104462306a36Sopenharmony_ci break; 104562306a36Sopenharmony_ci case S_IFREG: 104662306a36Sopenharmony_ci inode->i_op = &hugetlbfs_inode_operations; 104762306a36Sopenharmony_ci inode->i_fop = &hugetlbfs_file_operations; 104862306a36Sopenharmony_ci break; 104962306a36Sopenharmony_ci case S_IFDIR: 105062306a36Sopenharmony_ci inode->i_op = &hugetlbfs_dir_inode_operations; 105162306a36Sopenharmony_ci inode->i_fop = &simple_dir_operations; 105262306a36Sopenharmony_ci 105362306a36Sopenharmony_ci /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ 105462306a36Sopenharmony_ci inc_nlink(inode); 105562306a36Sopenharmony_ci break; 105662306a36Sopenharmony_ci case S_IFLNK: 105762306a36Sopenharmony_ci inode->i_op = &page_symlink_inode_operations; 105862306a36Sopenharmony_ci inode_nohighmem(inode); 105962306a36Sopenharmony_ci break; 106062306a36Sopenharmony_ci } 106162306a36Sopenharmony_ci lockdep_annotate_inode_mutex_key(inode); 106262306a36Sopenharmony_ci } else { 106362306a36Sopenharmony_ci if (resv_map) 106462306a36Sopenharmony_ci kref_put(&resv_map->refs, resv_map_release); 106562306a36Sopenharmony_ci } 106662306a36Sopenharmony_ci 106762306a36Sopenharmony_ci return inode; 106862306a36Sopenharmony_ci} 106962306a36Sopenharmony_ci 107062306a36Sopenharmony_ci/* 107162306a36Sopenharmony_ci * File creation. Allocate an inode, and we're done.. 107262306a36Sopenharmony_ci */ 107362306a36Sopenharmony_cistatic int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, 107462306a36Sopenharmony_ci struct dentry *dentry, umode_t mode, dev_t dev) 107562306a36Sopenharmony_ci{ 107662306a36Sopenharmony_ci struct inode *inode; 107762306a36Sopenharmony_ci 107862306a36Sopenharmony_ci inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); 107962306a36Sopenharmony_ci if (!inode) 108062306a36Sopenharmony_ci return -ENOSPC; 108162306a36Sopenharmony_ci dir->i_mtime = inode_set_ctime_current(dir); 108262306a36Sopenharmony_ci d_instantiate(dentry, inode); 108362306a36Sopenharmony_ci dget(dentry);/* Extra count - pin the dentry in core */ 108462306a36Sopenharmony_ci return 0; 108562306a36Sopenharmony_ci} 108662306a36Sopenharmony_ci 108762306a36Sopenharmony_cistatic int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, 108862306a36Sopenharmony_ci struct dentry *dentry, umode_t mode) 108962306a36Sopenharmony_ci{ 109062306a36Sopenharmony_ci int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, 109162306a36Sopenharmony_ci mode | S_IFDIR, 0); 109262306a36Sopenharmony_ci if (!retval) 109362306a36Sopenharmony_ci 
inc_nlink(dir); 109462306a36Sopenharmony_ci return retval; 109562306a36Sopenharmony_ci} 109662306a36Sopenharmony_ci 109762306a36Sopenharmony_cistatic int hugetlbfs_create(struct mnt_idmap *idmap, 109862306a36Sopenharmony_ci struct inode *dir, struct dentry *dentry, 109962306a36Sopenharmony_ci umode_t mode, bool excl) 110062306a36Sopenharmony_ci{ 110162306a36Sopenharmony_ci return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); 110262306a36Sopenharmony_ci} 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_cistatic int hugetlbfs_tmpfile(struct mnt_idmap *idmap, 110562306a36Sopenharmony_ci struct inode *dir, struct file *file, 110662306a36Sopenharmony_ci umode_t mode) 110762306a36Sopenharmony_ci{ 110862306a36Sopenharmony_ci struct inode *inode; 110962306a36Sopenharmony_ci 111062306a36Sopenharmony_ci inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); 111162306a36Sopenharmony_ci if (!inode) 111262306a36Sopenharmony_ci return -ENOSPC; 111362306a36Sopenharmony_ci dir->i_mtime = inode_set_ctime_current(dir); 111462306a36Sopenharmony_ci d_tmpfile(file, inode); 111562306a36Sopenharmony_ci return finish_open_simple(file, 0); 111662306a36Sopenharmony_ci} 111762306a36Sopenharmony_ci 111862306a36Sopenharmony_cistatic int hugetlbfs_symlink(struct mnt_idmap *idmap, 111962306a36Sopenharmony_ci struct inode *dir, struct dentry *dentry, 112062306a36Sopenharmony_ci const char *symname) 112162306a36Sopenharmony_ci{ 112262306a36Sopenharmony_ci struct inode *inode; 112362306a36Sopenharmony_ci int error = -ENOSPC; 112462306a36Sopenharmony_ci 112562306a36Sopenharmony_ci inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); 112662306a36Sopenharmony_ci if (inode) { 112762306a36Sopenharmony_ci int l = strlen(symname)+1; 112862306a36Sopenharmony_ci error = page_symlink(inode, symname, l); 112962306a36Sopenharmony_ci if (!error) { 113062306a36Sopenharmony_ci d_instantiate(dentry, inode); 113162306a36Sopenharmony_ci dget(dentry); 
113262306a36Sopenharmony_ci } else 113362306a36Sopenharmony_ci iput(inode); 113462306a36Sopenharmony_ci } 113562306a36Sopenharmony_ci dir->i_mtime = inode_set_ctime_current(dir); 113662306a36Sopenharmony_ci 113762306a36Sopenharmony_ci return error; 113862306a36Sopenharmony_ci} 113962306a36Sopenharmony_ci 114062306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION 114162306a36Sopenharmony_cistatic int hugetlbfs_migrate_folio(struct address_space *mapping, 114262306a36Sopenharmony_ci struct folio *dst, struct folio *src, 114362306a36Sopenharmony_ci enum migrate_mode mode) 114462306a36Sopenharmony_ci{ 114562306a36Sopenharmony_ci int rc; 114662306a36Sopenharmony_ci 114762306a36Sopenharmony_ci rc = migrate_huge_page_move_mapping(mapping, dst, src); 114862306a36Sopenharmony_ci if (rc != MIGRATEPAGE_SUCCESS) 114962306a36Sopenharmony_ci return rc; 115062306a36Sopenharmony_ci 115162306a36Sopenharmony_ci if (hugetlb_folio_subpool(src)) { 115262306a36Sopenharmony_ci hugetlb_set_folio_subpool(dst, 115362306a36Sopenharmony_ci hugetlb_folio_subpool(src)); 115462306a36Sopenharmony_ci hugetlb_set_folio_subpool(src, NULL); 115562306a36Sopenharmony_ci } 115662306a36Sopenharmony_ci 115762306a36Sopenharmony_ci if (mode != MIGRATE_SYNC_NO_COPY) 115862306a36Sopenharmony_ci folio_migrate_copy(dst, src); 115962306a36Sopenharmony_ci else 116062306a36Sopenharmony_ci folio_migrate_flags(dst, src); 116162306a36Sopenharmony_ci 116262306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 116362306a36Sopenharmony_ci} 116462306a36Sopenharmony_ci#else 116562306a36Sopenharmony_ci#define hugetlbfs_migrate_folio NULL 116662306a36Sopenharmony_ci#endif 116762306a36Sopenharmony_ci 116862306a36Sopenharmony_cistatic int hugetlbfs_error_remove_page(struct address_space *mapping, 116962306a36Sopenharmony_ci struct page *page) 117062306a36Sopenharmony_ci{ 117162306a36Sopenharmony_ci return 0; 117262306a36Sopenharmony_ci} 117362306a36Sopenharmony_ci 117462306a36Sopenharmony_ci/* 117562306a36Sopenharmony_ci * Display the 
mount options in /proc/mounts. 117662306a36Sopenharmony_ci */ 117762306a36Sopenharmony_cistatic int hugetlbfs_show_options(struct seq_file *m, struct dentry *root) 117862306a36Sopenharmony_ci{ 117962306a36Sopenharmony_ci struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb); 118062306a36Sopenharmony_ci struct hugepage_subpool *spool = sbinfo->spool; 118162306a36Sopenharmony_ci unsigned long hpage_size = huge_page_size(sbinfo->hstate); 118262306a36Sopenharmony_ci unsigned hpage_shift = huge_page_shift(sbinfo->hstate); 118362306a36Sopenharmony_ci char mod; 118462306a36Sopenharmony_ci 118562306a36Sopenharmony_ci if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 118662306a36Sopenharmony_ci seq_printf(m, ",uid=%u", 118762306a36Sopenharmony_ci from_kuid_munged(&init_user_ns, sbinfo->uid)); 118862306a36Sopenharmony_ci if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 118962306a36Sopenharmony_ci seq_printf(m, ",gid=%u", 119062306a36Sopenharmony_ci from_kgid_munged(&init_user_ns, sbinfo->gid)); 119162306a36Sopenharmony_ci if (sbinfo->mode != 0755) 119262306a36Sopenharmony_ci seq_printf(m, ",mode=%o", sbinfo->mode); 119362306a36Sopenharmony_ci if (sbinfo->max_inodes != -1) 119462306a36Sopenharmony_ci seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes); 119562306a36Sopenharmony_ci 119662306a36Sopenharmony_ci hpage_size /= 1024; 119762306a36Sopenharmony_ci mod = 'K'; 119862306a36Sopenharmony_ci if (hpage_size >= 1024) { 119962306a36Sopenharmony_ci hpage_size /= 1024; 120062306a36Sopenharmony_ci mod = 'M'; 120162306a36Sopenharmony_ci } 120262306a36Sopenharmony_ci seq_printf(m, ",pagesize=%lu%c", hpage_size, mod); 120362306a36Sopenharmony_ci if (spool) { 120462306a36Sopenharmony_ci if (spool->max_hpages != -1) 120562306a36Sopenharmony_ci seq_printf(m, ",size=%llu", 120662306a36Sopenharmony_ci (unsigned long long)spool->max_hpages << hpage_shift); 120762306a36Sopenharmony_ci if (spool->min_hpages != -1) 120862306a36Sopenharmony_ci seq_printf(m, ",min_size=%llu", 
120962306a36Sopenharmony_ci (unsigned long long)spool->min_hpages << hpage_shift); 121062306a36Sopenharmony_ci } 121162306a36Sopenharmony_ci return 0; 121262306a36Sopenharmony_ci} 121362306a36Sopenharmony_ci 121462306a36Sopenharmony_cistatic int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 121562306a36Sopenharmony_ci{ 121662306a36Sopenharmony_ci struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 121762306a36Sopenharmony_ci struct hstate *h = hstate_inode(d_inode(dentry)); 121862306a36Sopenharmony_ci 121962306a36Sopenharmony_ci buf->f_type = HUGETLBFS_MAGIC; 122062306a36Sopenharmony_ci buf->f_bsize = huge_page_size(h); 122162306a36Sopenharmony_ci if (sbinfo) { 122262306a36Sopenharmony_ci spin_lock(&sbinfo->stat_lock); 122362306a36Sopenharmony_ci /* If no limits set, just report 0 or -1 for max/free/used 122462306a36Sopenharmony_ci * blocks, like simple_statfs() */ 122562306a36Sopenharmony_ci if (sbinfo->spool) { 122662306a36Sopenharmony_ci long free_pages; 122762306a36Sopenharmony_ci 122862306a36Sopenharmony_ci spin_lock_irq(&sbinfo->spool->lock); 122962306a36Sopenharmony_ci buf->f_blocks = sbinfo->spool->max_hpages; 123062306a36Sopenharmony_ci free_pages = sbinfo->spool->max_hpages 123162306a36Sopenharmony_ci - sbinfo->spool->used_hpages; 123262306a36Sopenharmony_ci buf->f_bavail = buf->f_bfree = free_pages; 123362306a36Sopenharmony_ci spin_unlock_irq(&sbinfo->spool->lock); 123462306a36Sopenharmony_ci buf->f_files = sbinfo->max_inodes; 123562306a36Sopenharmony_ci buf->f_ffree = sbinfo->free_inodes; 123662306a36Sopenharmony_ci } 123762306a36Sopenharmony_ci spin_unlock(&sbinfo->stat_lock); 123862306a36Sopenharmony_ci } 123962306a36Sopenharmony_ci buf->f_namelen = NAME_MAX; 124062306a36Sopenharmony_ci return 0; 124162306a36Sopenharmony_ci} 124262306a36Sopenharmony_ci 124362306a36Sopenharmony_cistatic void hugetlbfs_put_super(struct super_block *sb) 124462306a36Sopenharmony_ci{ 124562306a36Sopenharmony_ci struct hugetlbfs_sb_info *sbi = 
HUGETLBFS_SB(sb); 124662306a36Sopenharmony_ci 124762306a36Sopenharmony_ci if (sbi) { 124862306a36Sopenharmony_ci sb->s_fs_info = NULL; 124962306a36Sopenharmony_ci 125062306a36Sopenharmony_ci if (sbi->spool) 125162306a36Sopenharmony_ci hugepage_put_subpool(sbi->spool); 125262306a36Sopenharmony_ci 125362306a36Sopenharmony_ci kfree(sbi); 125462306a36Sopenharmony_ci } 125562306a36Sopenharmony_ci} 125662306a36Sopenharmony_ci 125762306a36Sopenharmony_cistatic inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) 125862306a36Sopenharmony_ci{ 125962306a36Sopenharmony_ci if (sbinfo->free_inodes >= 0) { 126062306a36Sopenharmony_ci spin_lock(&sbinfo->stat_lock); 126162306a36Sopenharmony_ci if (unlikely(!sbinfo->free_inodes)) { 126262306a36Sopenharmony_ci spin_unlock(&sbinfo->stat_lock); 126362306a36Sopenharmony_ci return 0; 126462306a36Sopenharmony_ci } 126562306a36Sopenharmony_ci sbinfo->free_inodes--; 126662306a36Sopenharmony_ci spin_unlock(&sbinfo->stat_lock); 126762306a36Sopenharmony_ci } 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_ci return 1; 127062306a36Sopenharmony_ci} 127162306a36Sopenharmony_ci 127262306a36Sopenharmony_cistatic void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) 127362306a36Sopenharmony_ci{ 127462306a36Sopenharmony_ci if (sbinfo->free_inodes >= 0) { 127562306a36Sopenharmony_ci spin_lock(&sbinfo->stat_lock); 127662306a36Sopenharmony_ci sbinfo->free_inodes++; 127762306a36Sopenharmony_ci spin_unlock(&sbinfo->stat_lock); 127862306a36Sopenharmony_ci } 127962306a36Sopenharmony_ci} 128062306a36Sopenharmony_ci 128162306a36Sopenharmony_ci 128262306a36Sopenharmony_cistatic struct kmem_cache *hugetlbfs_inode_cachep; 128362306a36Sopenharmony_ci 128462306a36Sopenharmony_cistatic struct inode *hugetlbfs_alloc_inode(struct super_block *sb) 128562306a36Sopenharmony_ci{ 128662306a36Sopenharmony_ci struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); 128762306a36Sopenharmony_ci struct hugetlbfs_inode_info *p; 
128862306a36Sopenharmony_ci 128962306a36Sopenharmony_ci if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) 129062306a36Sopenharmony_ci return NULL; 129162306a36Sopenharmony_ci p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL); 129262306a36Sopenharmony_ci if (unlikely(!p)) { 129362306a36Sopenharmony_ci hugetlbfs_inc_free_inodes(sbinfo); 129462306a36Sopenharmony_ci return NULL; 129562306a36Sopenharmony_ci } 129662306a36Sopenharmony_ci 129762306a36Sopenharmony_ci /* 129862306a36Sopenharmony_ci * Any time after allocation, hugetlbfs_destroy_inode can be called 129962306a36Sopenharmony_ci * for the inode. mpol_free_shared_policy is unconditionally called 130062306a36Sopenharmony_ci * as part of hugetlbfs_destroy_inode. So, initialize policy here 130162306a36Sopenharmony_ci * in case of a quick call to destroy. 130262306a36Sopenharmony_ci * 130362306a36Sopenharmony_ci * Note that the policy is initialized even if we are creating a 130462306a36Sopenharmony_ci * private inode. This simplifies hugetlbfs_destroy_inode. 
130562306a36Sopenharmony_ci */ 130662306a36Sopenharmony_ci mpol_shared_policy_init(&p->policy, NULL); 130762306a36Sopenharmony_ci 130862306a36Sopenharmony_ci return &p->vfs_inode; 130962306a36Sopenharmony_ci} 131062306a36Sopenharmony_ci 131162306a36Sopenharmony_cistatic void hugetlbfs_free_inode(struct inode *inode) 131262306a36Sopenharmony_ci{ 131362306a36Sopenharmony_ci kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 131462306a36Sopenharmony_ci} 131562306a36Sopenharmony_ci 131662306a36Sopenharmony_cistatic void hugetlbfs_destroy_inode(struct inode *inode) 131762306a36Sopenharmony_ci{ 131862306a36Sopenharmony_ci hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 131962306a36Sopenharmony_ci mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 132062306a36Sopenharmony_ci} 132162306a36Sopenharmony_ci 132262306a36Sopenharmony_cistatic const struct address_space_operations hugetlbfs_aops = { 132362306a36Sopenharmony_ci .write_begin = hugetlbfs_write_begin, 132462306a36Sopenharmony_ci .write_end = hugetlbfs_write_end, 132562306a36Sopenharmony_ci .dirty_folio = noop_dirty_folio, 132662306a36Sopenharmony_ci .migrate_folio = hugetlbfs_migrate_folio, 132762306a36Sopenharmony_ci .error_remove_page = hugetlbfs_error_remove_page, 132862306a36Sopenharmony_ci}; 132962306a36Sopenharmony_ci 133062306a36Sopenharmony_ci 133162306a36Sopenharmony_cistatic void init_once(void *foo) 133262306a36Sopenharmony_ci{ 133362306a36Sopenharmony_ci struct hugetlbfs_inode_info *ei = foo; 133462306a36Sopenharmony_ci 133562306a36Sopenharmony_ci inode_init_once(&ei->vfs_inode); 133662306a36Sopenharmony_ci} 133762306a36Sopenharmony_ci 133862306a36Sopenharmony_ciconst struct file_operations hugetlbfs_file_operations = { 133962306a36Sopenharmony_ci .read_iter = hugetlbfs_read_iter, 134062306a36Sopenharmony_ci .mmap = hugetlbfs_file_mmap, 134162306a36Sopenharmony_ci .fsync = noop_fsync, 134262306a36Sopenharmony_ci .get_unmapped_area = hugetlb_get_unmapped_area, 
134362306a36Sopenharmony_ci .llseek = default_llseek, 134462306a36Sopenharmony_ci .fallocate = hugetlbfs_fallocate, 134562306a36Sopenharmony_ci}; 134662306a36Sopenharmony_ci 134762306a36Sopenharmony_cistatic const struct inode_operations hugetlbfs_dir_inode_operations = { 134862306a36Sopenharmony_ci .create = hugetlbfs_create, 134962306a36Sopenharmony_ci .lookup = simple_lookup, 135062306a36Sopenharmony_ci .link = simple_link, 135162306a36Sopenharmony_ci .unlink = simple_unlink, 135262306a36Sopenharmony_ci .symlink = hugetlbfs_symlink, 135362306a36Sopenharmony_ci .mkdir = hugetlbfs_mkdir, 135462306a36Sopenharmony_ci .rmdir = simple_rmdir, 135562306a36Sopenharmony_ci .mknod = hugetlbfs_mknod, 135662306a36Sopenharmony_ci .rename = simple_rename, 135762306a36Sopenharmony_ci .setattr = hugetlbfs_setattr, 135862306a36Sopenharmony_ci .tmpfile = hugetlbfs_tmpfile, 135962306a36Sopenharmony_ci}; 136062306a36Sopenharmony_ci 136162306a36Sopenharmony_cistatic const struct inode_operations hugetlbfs_inode_operations = { 136262306a36Sopenharmony_ci .setattr = hugetlbfs_setattr, 136362306a36Sopenharmony_ci}; 136462306a36Sopenharmony_ci 136562306a36Sopenharmony_cistatic const struct super_operations hugetlbfs_ops = { 136662306a36Sopenharmony_ci .alloc_inode = hugetlbfs_alloc_inode, 136762306a36Sopenharmony_ci .free_inode = hugetlbfs_free_inode, 136862306a36Sopenharmony_ci .destroy_inode = hugetlbfs_destroy_inode, 136962306a36Sopenharmony_ci .evict_inode = hugetlbfs_evict_inode, 137062306a36Sopenharmony_ci .statfs = hugetlbfs_statfs, 137162306a36Sopenharmony_ci .put_super = hugetlbfs_put_super, 137262306a36Sopenharmony_ci .show_options = hugetlbfs_show_options, 137362306a36Sopenharmony_ci}; 137462306a36Sopenharmony_ci 137562306a36Sopenharmony_ci/* 137662306a36Sopenharmony_ci * Convert size option passed from command line to number of huge pages 137762306a36Sopenharmony_ci * in the pool specified by hstate. 
Size option could be in bytes 137862306a36Sopenharmony_ci * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). 137962306a36Sopenharmony_ci */ 138062306a36Sopenharmony_cistatic long 138162306a36Sopenharmony_cihugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, 138262306a36Sopenharmony_ci enum hugetlbfs_size_type val_type) 138362306a36Sopenharmony_ci{ 138462306a36Sopenharmony_ci if (val_type == NO_SIZE) 138562306a36Sopenharmony_ci return -1; 138662306a36Sopenharmony_ci 138762306a36Sopenharmony_ci if (val_type == SIZE_PERCENT) { 138862306a36Sopenharmony_ci size_opt <<= huge_page_shift(h); 138962306a36Sopenharmony_ci size_opt *= h->max_huge_pages; 139062306a36Sopenharmony_ci do_div(size_opt, 100); 139162306a36Sopenharmony_ci } 139262306a36Sopenharmony_ci 139362306a36Sopenharmony_ci size_opt >>= huge_page_shift(h); 139462306a36Sopenharmony_ci return size_opt; 139562306a36Sopenharmony_ci} 139662306a36Sopenharmony_ci 139762306a36Sopenharmony_ci/* 139862306a36Sopenharmony_ci * Parse one mount parameter. 
139962306a36Sopenharmony_ci */ 140062306a36Sopenharmony_cistatic int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) 140162306a36Sopenharmony_ci{ 140262306a36Sopenharmony_ci struct hugetlbfs_fs_context *ctx = fc->fs_private; 140362306a36Sopenharmony_ci struct fs_parse_result result; 140462306a36Sopenharmony_ci struct hstate *h; 140562306a36Sopenharmony_ci char *rest; 140662306a36Sopenharmony_ci unsigned long ps; 140762306a36Sopenharmony_ci int opt; 140862306a36Sopenharmony_ci 140962306a36Sopenharmony_ci opt = fs_parse(fc, hugetlb_fs_parameters, param, &result); 141062306a36Sopenharmony_ci if (opt < 0) 141162306a36Sopenharmony_ci return opt; 141262306a36Sopenharmony_ci 141362306a36Sopenharmony_ci switch (opt) { 141462306a36Sopenharmony_ci case Opt_uid: 141562306a36Sopenharmony_ci ctx->uid = make_kuid(current_user_ns(), result.uint_32); 141662306a36Sopenharmony_ci if (!uid_valid(ctx->uid)) 141762306a36Sopenharmony_ci goto bad_val; 141862306a36Sopenharmony_ci return 0; 141962306a36Sopenharmony_ci 142062306a36Sopenharmony_ci case Opt_gid: 142162306a36Sopenharmony_ci ctx->gid = make_kgid(current_user_ns(), result.uint_32); 142262306a36Sopenharmony_ci if (!gid_valid(ctx->gid)) 142362306a36Sopenharmony_ci goto bad_val; 142462306a36Sopenharmony_ci return 0; 142562306a36Sopenharmony_ci 142662306a36Sopenharmony_ci case Opt_mode: 142762306a36Sopenharmony_ci ctx->mode = result.uint_32 & 01777U; 142862306a36Sopenharmony_ci return 0; 142962306a36Sopenharmony_ci 143062306a36Sopenharmony_ci case Opt_size: 143162306a36Sopenharmony_ci /* memparse() will accept a K/M/G without a digit */ 143262306a36Sopenharmony_ci if (!param->string || !isdigit(param->string[0])) 143362306a36Sopenharmony_ci goto bad_val; 143462306a36Sopenharmony_ci ctx->max_size_opt = memparse(param->string, &rest); 143562306a36Sopenharmony_ci ctx->max_val_type = SIZE_STD; 143662306a36Sopenharmony_ci if (*rest == '%') 143762306a36Sopenharmony_ci ctx->max_val_type = SIZE_PERCENT; 
143862306a36Sopenharmony_ci return 0; 143962306a36Sopenharmony_ci 144062306a36Sopenharmony_ci case Opt_nr_inodes: 144162306a36Sopenharmony_ci /* memparse() will accept a K/M/G without a digit */ 144262306a36Sopenharmony_ci if (!param->string || !isdigit(param->string[0])) 144362306a36Sopenharmony_ci goto bad_val; 144462306a36Sopenharmony_ci ctx->nr_inodes = memparse(param->string, &rest); 144562306a36Sopenharmony_ci return 0; 144662306a36Sopenharmony_ci 144762306a36Sopenharmony_ci case Opt_pagesize: 144862306a36Sopenharmony_ci ps = memparse(param->string, &rest); 144962306a36Sopenharmony_ci h = size_to_hstate(ps); 145062306a36Sopenharmony_ci if (!h) { 145162306a36Sopenharmony_ci pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); 145262306a36Sopenharmony_ci return -EINVAL; 145362306a36Sopenharmony_ci } 145462306a36Sopenharmony_ci ctx->hstate = h; 145562306a36Sopenharmony_ci return 0; 145662306a36Sopenharmony_ci 145762306a36Sopenharmony_ci case Opt_min_size: 145862306a36Sopenharmony_ci /* memparse() will accept a K/M/G without a digit */ 145962306a36Sopenharmony_ci if (!param->string || !isdigit(param->string[0])) 146062306a36Sopenharmony_ci goto bad_val; 146162306a36Sopenharmony_ci ctx->min_size_opt = memparse(param->string, &rest); 146262306a36Sopenharmony_ci ctx->min_val_type = SIZE_STD; 146362306a36Sopenharmony_ci if (*rest == '%') 146462306a36Sopenharmony_ci ctx->min_val_type = SIZE_PERCENT; 146562306a36Sopenharmony_ci return 0; 146662306a36Sopenharmony_ci 146762306a36Sopenharmony_ci default: 146862306a36Sopenharmony_ci return -EINVAL; 146962306a36Sopenharmony_ci } 147062306a36Sopenharmony_ci 147162306a36Sopenharmony_cibad_val: 147262306a36Sopenharmony_ci return invalfc(fc, "Bad value '%s' for mount option '%s'\n", 147362306a36Sopenharmony_ci param->string, param->key); 147462306a36Sopenharmony_ci} 147562306a36Sopenharmony_ci 147662306a36Sopenharmony_ci/* 147762306a36Sopenharmony_ci * Validate the parsed options. 
147862306a36Sopenharmony_ci */ 147962306a36Sopenharmony_cistatic int hugetlbfs_validate(struct fs_context *fc) 148062306a36Sopenharmony_ci{ 148162306a36Sopenharmony_ci struct hugetlbfs_fs_context *ctx = fc->fs_private; 148262306a36Sopenharmony_ci 148362306a36Sopenharmony_ci /* 148462306a36Sopenharmony_ci * Use huge page pool size (in hstate) to convert the size 148562306a36Sopenharmony_ci * options to number of huge pages. If NO_SIZE, -1 is returned. 148662306a36Sopenharmony_ci */ 148762306a36Sopenharmony_ci ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, 148862306a36Sopenharmony_ci ctx->max_size_opt, 148962306a36Sopenharmony_ci ctx->max_val_type); 149062306a36Sopenharmony_ci ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, 149162306a36Sopenharmony_ci ctx->min_size_opt, 149262306a36Sopenharmony_ci ctx->min_val_type); 149362306a36Sopenharmony_ci 149462306a36Sopenharmony_ci /* 149562306a36Sopenharmony_ci * If max_size was specified, then min_size must be smaller 149662306a36Sopenharmony_ci */ 149762306a36Sopenharmony_ci if (ctx->max_val_type > NO_SIZE && 149862306a36Sopenharmony_ci ctx->min_hpages > ctx->max_hpages) { 149962306a36Sopenharmony_ci pr_err("Minimum size can not be greater than maximum size\n"); 150062306a36Sopenharmony_ci return -EINVAL; 150162306a36Sopenharmony_ci } 150262306a36Sopenharmony_ci 150362306a36Sopenharmony_ci return 0; 150462306a36Sopenharmony_ci} 150562306a36Sopenharmony_ci 150662306a36Sopenharmony_cistatic int 150762306a36Sopenharmony_cihugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) 150862306a36Sopenharmony_ci{ 150962306a36Sopenharmony_ci struct hugetlbfs_fs_context *ctx = fc->fs_private; 151062306a36Sopenharmony_ci struct hugetlbfs_sb_info *sbinfo; 151162306a36Sopenharmony_ci 151262306a36Sopenharmony_ci sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); 151362306a36Sopenharmony_ci if (!sbinfo) 151462306a36Sopenharmony_ci return -ENOMEM; 151562306a36Sopenharmony_ci sb->s_fs_info = sbinfo; 
151662306a36Sopenharmony_ci spin_lock_init(&sbinfo->stat_lock); 151762306a36Sopenharmony_ci sbinfo->hstate = ctx->hstate; 151862306a36Sopenharmony_ci sbinfo->max_inodes = ctx->nr_inodes; 151962306a36Sopenharmony_ci sbinfo->free_inodes = ctx->nr_inodes; 152062306a36Sopenharmony_ci sbinfo->spool = NULL; 152162306a36Sopenharmony_ci sbinfo->uid = ctx->uid; 152262306a36Sopenharmony_ci sbinfo->gid = ctx->gid; 152362306a36Sopenharmony_ci sbinfo->mode = ctx->mode; 152462306a36Sopenharmony_ci 152562306a36Sopenharmony_ci /* 152662306a36Sopenharmony_ci * Allocate and initialize subpool if maximum or minimum size is 152762306a36Sopenharmony_ci * specified. Any needed reservations (for minimum size) are taken 152862306a36Sopenharmony_ci * when the subpool is created. 152962306a36Sopenharmony_ci */ 153062306a36Sopenharmony_ci if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { 153162306a36Sopenharmony_ci sbinfo->spool = hugepage_new_subpool(ctx->hstate, 153262306a36Sopenharmony_ci ctx->max_hpages, 153362306a36Sopenharmony_ci ctx->min_hpages); 153462306a36Sopenharmony_ci if (!sbinfo->spool) 153562306a36Sopenharmony_ci goto out_free; 153662306a36Sopenharmony_ci } 153762306a36Sopenharmony_ci sb->s_maxbytes = MAX_LFS_FILESIZE; 153862306a36Sopenharmony_ci sb->s_blocksize = huge_page_size(ctx->hstate); 153962306a36Sopenharmony_ci sb->s_blocksize_bits = huge_page_shift(ctx->hstate); 154062306a36Sopenharmony_ci sb->s_magic = HUGETLBFS_MAGIC; 154162306a36Sopenharmony_ci sb->s_op = &hugetlbfs_ops; 154262306a36Sopenharmony_ci sb->s_time_gran = 1; 154362306a36Sopenharmony_ci 154462306a36Sopenharmony_ci /* 154562306a36Sopenharmony_ci * Due to the special and limited functionality of hugetlbfs, it does 154662306a36Sopenharmony_ci * not work well as a stacking filesystem. 
154762306a36Sopenharmony_ci */ 154862306a36Sopenharmony_ci sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; 154962306a36Sopenharmony_ci sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); 155062306a36Sopenharmony_ci if (!sb->s_root) 155162306a36Sopenharmony_ci goto out_free; 155262306a36Sopenharmony_ci return 0; 155362306a36Sopenharmony_ciout_free: 155462306a36Sopenharmony_ci kfree(sbinfo->spool); 155562306a36Sopenharmony_ci kfree(sbinfo); 155662306a36Sopenharmony_ci return -ENOMEM; 155762306a36Sopenharmony_ci} 155862306a36Sopenharmony_ci 155962306a36Sopenharmony_cistatic int hugetlbfs_get_tree(struct fs_context *fc) 156062306a36Sopenharmony_ci{ 156162306a36Sopenharmony_ci int err = hugetlbfs_validate(fc); 156262306a36Sopenharmony_ci if (err) 156362306a36Sopenharmony_ci return err; 156462306a36Sopenharmony_ci return get_tree_nodev(fc, hugetlbfs_fill_super); 156562306a36Sopenharmony_ci} 156662306a36Sopenharmony_ci 156762306a36Sopenharmony_cistatic void hugetlbfs_fs_context_free(struct fs_context *fc) 156862306a36Sopenharmony_ci{ 156962306a36Sopenharmony_ci kfree(fc->fs_private); 157062306a36Sopenharmony_ci} 157162306a36Sopenharmony_ci 157262306a36Sopenharmony_cistatic const struct fs_context_operations hugetlbfs_fs_context_ops = { 157362306a36Sopenharmony_ci .free = hugetlbfs_fs_context_free, 157462306a36Sopenharmony_ci .parse_param = hugetlbfs_parse_param, 157562306a36Sopenharmony_ci .get_tree = hugetlbfs_get_tree, 157662306a36Sopenharmony_ci}; 157762306a36Sopenharmony_ci 157862306a36Sopenharmony_cistatic int hugetlbfs_init_fs_context(struct fs_context *fc) 157962306a36Sopenharmony_ci{ 158062306a36Sopenharmony_ci struct hugetlbfs_fs_context *ctx; 158162306a36Sopenharmony_ci 158262306a36Sopenharmony_ci ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); 158362306a36Sopenharmony_ci if (!ctx) 158462306a36Sopenharmony_ci return -ENOMEM; 158562306a36Sopenharmony_ci 158662306a36Sopenharmony_ci ctx->max_hpages = -1; /* No limit on size by default */ 
158762306a36Sopenharmony_ci ctx->nr_inodes = -1; /* No limit on number of inodes by default */ 158862306a36Sopenharmony_ci ctx->uid = current_fsuid(); 158962306a36Sopenharmony_ci ctx->gid = current_fsgid(); 159062306a36Sopenharmony_ci ctx->mode = 0755; 159162306a36Sopenharmony_ci ctx->hstate = &default_hstate; 159262306a36Sopenharmony_ci ctx->min_hpages = -1; /* No default minimum size */ 159362306a36Sopenharmony_ci ctx->max_val_type = NO_SIZE; 159462306a36Sopenharmony_ci ctx->min_val_type = NO_SIZE; 159562306a36Sopenharmony_ci fc->fs_private = ctx; 159662306a36Sopenharmony_ci fc->ops = &hugetlbfs_fs_context_ops; 159762306a36Sopenharmony_ci return 0; 159862306a36Sopenharmony_ci} 159962306a36Sopenharmony_ci 160062306a36Sopenharmony_cistatic struct file_system_type hugetlbfs_fs_type = { 160162306a36Sopenharmony_ci .name = "hugetlbfs", 160262306a36Sopenharmony_ci .init_fs_context = hugetlbfs_init_fs_context, 160362306a36Sopenharmony_ci .parameters = hugetlb_fs_parameters, 160462306a36Sopenharmony_ci .kill_sb = kill_litter_super, 160562306a36Sopenharmony_ci}; 160662306a36Sopenharmony_ci 160762306a36Sopenharmony_cistatic struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; 160862306a36Sopenharmony_ci 160962306a36Sopenharmony_cistatic int can_do_hugetlb_shm(void) 161062306a36Sopenharmony_ci{ 161162306a36Sopenharmony_ci kgid_t shm_group; 161262306a36Sopenharmony_ci shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); 161362306a36Sopenharmony_ci return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 161462306a36Sopenharmony_ci} 161562306a36Sopenharmony_ci 161662306a36Sopenharmony_cistatic int get_hstate_idx(int page_size_log) 161762306a36Sopenharmony_ci{ 161862306a36Sopenharmony_ci struct hstate *h = hstate_sizelog(page_size_log); 161962306a36Sopenharmony_ci 162062306a36Sopenharmony_ci if (!h) 162162306a36Sopenharmony_ci return -1; 162262306a36Sopenharmony_ci return hstate_index(h); 162362306a36Sopenharmony_ci} 162462306a36Sopenharmony_ci 
162562306a36Sopenharmony_ci/* 162662306a36Sopenharmony_ci * Note that size should be aligned to proper hugepage size in caller side, 162762306a36Sopenharmony_ci * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. 162862306a36Sopenharmony_ci */ 162962306a36Sopenharmony_cistruct file *hugetlb_file_setup(const char *name, size_t size, 163062306a36Sopenharmony_ci vm_flags_t acctflag, int creat_flags, 163162306a36Sopenharmony_ci int page_size_log) 163262306a36Sopenharmony_ci{ 163362306a36Sopenharmony_ci struct inode *inode; 163462306a36Sopenharmony_ci struct vfsmount *mnt; 163562306a36Sopenharmony_ci int hstate_idx; 163662306a36Sopenharmony_ci struct file *file; 163762306a36Sopenharmony_ci 163862306a36Sopenharmony_ci hstate_idx = get_hstate_idx(page_size_log); 163962306a36Sopenharmony_ci if (hstate_idx < 0) 164062306a36Sopenharmony_ci return ERR_PTR(-ENODEV); 164162306a36Sopenharmony_ci 164262306a36Sopenharmony_ci mnt = hugetlbfs_vfsmount[hstate_idx]; 164362306a36Sopenharmony_ci if (!mnt) 164462306a36Sopenharmony_ci return ERR_PTR(-ENOENT); 164562306a36Sopenharmony_ci 164662306a36Sopenharmony_ci if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 164762306a36Sopenharmony_ci struct ucounts *ucounts = current_ucounts(); 164862306a36Sopenharmony_ci 164962306a36Sopenharmony_ci if (user_shm_lock(size, ucounts)) { 165062306a36Sopenharmony_ci pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n", 165162306a36Sopenharmony_ci current->comm, current->pid); 165262306a36Sopenharmony_ci user_shm_unlock(size, ucounts); 165362306a36Sopenharmony_ci } 165462306a36Sopenharmony_ci return ERR_PTR(-EPERM); 165562306a36Sopenharmony_ci } 165662306a36Sopenharmony_ci 165762306a36Sopenharmony_ci file = ERR_PTR(-ENOSPC); 165862306a36Sopenharmony_ci inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0); 165962306a36Sopenharmony_ci if (!inode) 166062306a36Sopenharmony_ci goto out; 166162306a36Sopenharmony_ci if 
(creat_flags == HUGETLB_SHMFS_INODE) 166262306a36Sopenharmony_ci inode->i_flags |= S_PRIVATE; 166362306a36Sopenharmony_ci 166462306a36Sopenharmony_ci inode->i_size = size; 166562306a36Sopenharmony_ci clear_nlink(inode); 166662306a36Sopenharmony_ci 166762306a36Sopenharmony_ci if (!hugetlb_reserve_pages(inode, 0, 166862306a36Sopenharmony_ci size >> huge_page_shift(hstate_inode(inode)), NULL, 166962306a36Sopenharmony_ci acctflag)) 167062306a36Sopenharmony_ci file = ERR_PTR(-ENOMEM); 167162306a36Sopenharmony_ci else 167262306a36Sopenharmony_ci file = alloc_file_pseudo(inode, mnt, name, O_RDWR, 167362306a36Sopenharmony_ci &hugetlbfs_file_operations); 167462306a36Sopenharmony_ci if (!IS_ERR(file)) 167562306a36Sopenharmony_ci return file; 167662306a36Sopenharmony_ci 167762306a36Sopenharmony_ci iput(inode); 167862306a36Sopenharmony_ciout: 167962306a36Sopenharmony_ci return file; 168062306a36Sopenharmony_ci} 168162306a36Sopenharmony_ci 168262306a36Sopenharmony_cistatic struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) 168362306a36Sopenharmony_ci{ 168462306a36Sopenharmony_ci struct fs_context *fc; 168562306a36Sopenharmony_ci struct vfsmount *mnt; 168662306a36Sopenharmony_ci 168762306a36Sopenharmony_ci fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); 168862306a36Sopenharmony_ci if (IS_ERR(fc)) { 168962306a36Sopenharmony_ci mnt = ERR_CAST(fc); 169062306a36Sopenharmony_ci } else { 169162306a36Sopenharmony_ci struct hugetlbfs_fs_context *ctx = fc->fs_private; 169262306a36Sopenharmony_ci ctx->hstate = h; 169362306a36Sopenharmony_ci mnt = fc_mount(fc); 169462306a36Sopenharmony_ci put_fs_context(fc); 169562306a36Sopenharmony_ci } 169662306a36Sopenharmony_ci if (IS_ERR(mnt)) 169762306a36Sopenharmony_ci pr_err("Cannot mount internal hugetlbfs for page size %luK", 169862306a36Sopenharmony_ci huge_page_size(h) / SZ_1K); 169962306a36Sopenharmony_ci return mnt; 170062306a36Sopenharmony_ci} 170162306a36Sopenharmony_ci 170262306a36Sopenharmony_cistatic int __init 
init_hugetlbfs_fs(void) 170362306a36Sopenharmony_ci{ 170462306a36Sopenharmony_ci struct vfsmount *mnt; 170562306a36Sopenharmony_ci struct hstate *h; 170662306a36Sopenharmony_ci int error; 170762306a36Sopenharmony_ci int i; 170862306a36Sopenharmony_ci 170962306a36Sopenharmony_ci if (!hugepages_supported()) { 171062306a36Sopenharmony_ci pr_info("disabling because there are no supported hugepage sizes\n"); 171162306a36Sopenharmony_ci return -ENOTSUPP; 171262306a36Sopenharmony_ci } 171362306a36Sopenharmony_ci 171462306a36Sopenharmony_ci error = -ENOMEM; 171562306a36Sopenharmony_ci hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 171662306a36Sopenharmony_ci sizeof(struct hugetlbfs_inode_info), 171762306a36Sopenharmony_ci 0, SLAB_ACCOUNT, init_once); 171862306a36Sopenharmony_ci if (hugetlbfs_inode_cachep == NULL) 171962306a36Sopenharmony_ci goto out; 172062306a36Sopenharmony_ci 172162306a36Sopenharmony_ci error = register_filesystem(&hugetlbfs_fs_type); 172262306a36Sopenharmony_ci if (error) 172362306a36Sopenharmony_ci goto out_free; 172462306a36Sopenharmony_ci 172562306a36Sopenharmony_ci /* default hstate mount is required */ 172662306a36Sopenharmony_ci mnt = mount_one_hugetlbfs(&default_hstate); 172762306a36Sopenharmony_ci if (IS_ERR(mnt)) { 172862306a36Sopenharmony_ci error = PTR_ERR(mnt); 172962306a36Sopenharmony_ci goto out_unreg; 173062306a36Sopenharmony_ci } 173162306a36Sopenharmony_ci hugetlbfs_vfsmount[default_hstate_idx] = mnt; 173262306a36Sopenharmony_ci 173362306a36Sopenharmony_ci /* other hstates are optional */ 173462306a36Sopenharmony_ci i = 0; 173562306a36Sopenharmony_ci for_each_hstate(h) { 173662306a36Sopenharmony_ci if (i == default_hstate_idx) { 173762306a36Sopenharmony_ci i++; 173862306a36Sopenharmony_ci continue; 173962306a36Sopenharmony_ci } 174062306a36Sopenharmony_ci 174162306a36Sopenharmony_ci mnt = mount_one_hugetlbfs(h); 174262306a36Sopenharmony_ci if (IS_ERR(mnt)) 174362306a36Sopenharmony_ci hugetlbfs_vfsmount[i] = NULL; 
174462306a36Sopenharmony_ci else 174562306a36Sopenharmony_ci hugetlbfs_vfsmount[i] = mnt; 174662306a36Sopenharmony_ci i++; 174762306a36Sopenharmony_ci } 174862306a36Sopenharmony_ci 174962306a36Sopenharmony_ci return 0; 175062306a36Sopenharmony_ci 175162306a36Sopenharmony_ci out_unreg: 175262306a36Sopenharmony_ci (void)unregister_filesystem(&hugetlbfs_fs_type); 175362306a36Sopenharmony_ci out_free: 175462306a36Sopenharmony_ci kmem_cache_destroy(hugetlbfs_inode_cachep); 175562306a36Sopenharmony_ci out: 175662306a36Sopenharmony_ci return error; 175762306a36Sopenharmony_ci} 175862306a36Sopenharmony_cifs_initcall(init_hugetlbfs_fs) 1759