xref: /kernel/linux/linux-6.6/fs/hugetlbfs/inode.c (revision 62306a36)
1/*
2 * hugetlbpage-backed filesystem.  Based on ramfs.
3 *
4 * Nadia Yvette Chambers, 2002
5 *
6 * Copyright (C) 2002 Linus Torvalds.
7 * License: GPL
8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12#include <linux/thread_info.h>
13#include <asm/current.h>
14#include <linux/falloc.h>
15#include <linux/fs.h>
16#include <linux/mount.h>
17#include <linux/file.h>
18#include <linux/kernel.h>
19#include <linux/writeback.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/init.h>
23#include <linux/string.h>
24#include <linux/capability.h>
25#include <linux/ctype.h>
26#include <linux/backing-dev.h>
27#include <linux/hugetlb.h>
28#include <linux/pagevec.h>
29#include <linux/fs_parser.h>
30#include <linux/mman.h>
31#include <linux/slab.h>
32#include <linux/dnotify.h>
33#include <linux/statfs.h>
34#include <linux/security.h>
35#include <linux/magic.h>
36#include <linux/migrate.h>
37#include <linux/uio.h>
38
39#include <linux/uaccess.h>
40#include <linux/sched/mm.h>
41
42static const struct address_space_operations hugetlbfs_aops;
43const struct file_operations hugetlbfs_file_operations;
44static const struct inode_operations hugetlbfs_dir_inode_operations;
45static const struct inode_operations hugetlbfs_inode_operations;
46
47enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
48
49struct hugetlbfs_fs_context {
50	struct hstate		*hstate;
51	unsigned long long	max_size_opt;
52	unsigned long long	min_size_opt;
53	long			max_hpages;
54	long			nr_inodes;
55	long			min_hpages;
56	enum hugetlbfs_size_type max_val_type;
57	enum hugetlbfs_size_type min_val_type;
58	kuid_t			uid;
59	kgid_t			gid;
60	umode_t			mode;
61};
62
63int sysctl_hugetlb_shm_group;
64
65enum hugetlb_param {
66	Opt_gid,
67	Opt_min_size,
68	Opt_mode,
69	Opt_nr_inodes,
70	Opt_pagesize,
71	Opt_size,
72	Opt_uid,
73};
74
75static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
76	fsparam_u32   ("gid",		Opt_gid),
77	fsparam_string("min_size",	Opt_min_size),
78	fsparam_u32oct("mode",		Opt_mode),
79	fsparam_string("nr_inodes",	Opt_nr_inodes),
80	fsparam_string("pagesize",	Opt_pagesize),
81	fsparam_string("size",		Opt_size),
82	fsparam_u32   ("uid",		Opt_uid),
83	{}
84};
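
/*
 * Illustrative only (not part of the original source): the parameters above
 * correspond to mount options such as
 *
 *	mount -t hugetlbfs -o pagesize=2M,size=1G,min_size=512M,mode=1770 none /mnt/huge
 *
 * where /mnt/huge is an arbitrary example mount point.  Each option is
 * handled in hugetlbfs_parse_param() below.
 */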
85
86#ifdef CONFIG_NUMA
87static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
88					struct inode *inode, pgoff_t index)
89{
90	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
91							index);
92}
93
94static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
95{
96	mpol_cond_put(vma->vm_policy);
97}
98#else
99static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
100					struct inode *inode, pgoff_t index)
101{
102}
103
104static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
105{
106}
107#endif
108
109/*
110 * Mask used when checking the page offset value passed in via system
111 * calls.  This value will be converted to a loff_t which is signed.
112 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
113 * value.  The extra bit (- 1 in the shift value) is to take the sign
114 * bit into account.
115 */
116#define PGOFF_LOFFT_MAX \
117	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
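
/*
 * Worked example (illustrative): with BITS_PER_LONG == 64 and PAGE_SHIFT == 12,
 * PGOFF_LOFFT_MAX is 0xfff8000000000000, i.e. the top 13 bits.  A vm_pgoff
 * with any of those bits set would, after << PAGE_SHIFT, set or overflow the
 * loff_t sign bit.
 */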
118
119static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
120{
121	struct inode *inode = file_inode(file);
122	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
123	loff_t len, vma_len;
124	int ret;
125	struct hstate *h = hstate_file(file);
126	vm_flags_t vm_flags;
127
128	/*
129	 * vma address alignment (but not the pgoff alignment) has
130	 * already been checked by prepare_hugepage_range.  If you add
131	 * any error returns here, do so after setting VM_HUGETLB, so
132	 * that the is_vm_hugetlb_page() tests below unmap_region() go the
133	 * right way when do_mmap() unwinds (may be important on powerpc
134	 * and ia64).
135	 */
136	vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
137	vma->vm_ops = &hugetlb_vm_ops;
138
139	ret = seal_check_future_write(info->seals, vma);
140	if (ret)
141		return ret;
142
143	/*
144	 * page based offset in vm_pgoff could be sufficiently large to
145	 * overflow a loff_t when converted to byte offset.  This can
146	 * only happen on architectures where sizeof(loff_t) ==
147	 * sizeof(unsigned long).  So, only check in those instances.
148	 */
149	if (sizeof(unsigned long) == sizeof(loff_t)) {
150		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
151			return -EINVAL;
152	}
153
154	/* must be huge page aligned */
155	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
156		return -EINVAL;
157
158	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
159	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
160	/* check for overflow */
161	if (len < vma_len)
162		return -EINVAL;
163
164	inode_lock(inode);
165	file_accessed(file);
166
167	ret = -ENOMEM;
168
169	vm_flags = vma->vm_flags;
170	/*
171	 * For SHM_HUGETLB, the pages are reserved in the shmget() call, so skip
172	 * reserving here.  Note: the inode flag S_PRIVATE is set only for SHM
173	 * hugetlbfs files.
174	 */
175	if (inode->i_flags & S_PRIVATE)
176		vm_flags |= VM_NORESERVE;
177
178	if (!hugetlb_reserve_pages(inode,
179				vma->vm_pgoff >> huge_page_order(h),
180				len >> huge_page_shift(h), vma,
181				vm_flags))
182		goto out;
183
184	ret = 0;
185	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
186		i_size_write(inode, len);
187out:
188	inode_unlock(inode);
189
190	return ret;
191}
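
/*
 * Illustrative userspace usage (not part of the original source; the path is
 * an arbitrary example):
 *
 *	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
 *	void *p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *
 * Such a mapping is set up by hugetlbfs_file_mmap() above: length and offset
 * must be multiples of the huge page size, and the reservation for the whole
 * range is taken at mmap() time rather than at fault time.
 */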
192
193/*
194 * Called under mmap_write_lock(mm).
195 */
196
197static unsigned long
198hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
199		unsigned long len, unsigned long pgoff, unsigned long flags)
200{
201	struct hstate *h = hstate_file(file);
202	struct vm_unmapped_area_info info;
203
204	info.flags = 0;
205	info.length = len;
206	info.low_limit = current->mm->mmap_base;
207	info.high_limit = arch_get_mmap_end(addr, len, flags);
208	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
209	info.align_offset = 0;
210	return vm_unmapped_area(&info);
211}
212
213static unsigned long
214hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
215		unsigned long len, unsigned long pgoff, unsigned long flags)
216{
217	struct hstate *h = hstate_file(file);
218	struct vm_unmapped_area_info info;
219
220	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
221	info.length = len;
222	info.low_limit = PAGE_SIZE;
223	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
224	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
225	info.align_offset = 0;
226	addr = vm_unmapped_area(&info);
227
228	/*
229	 * A failed mmap() very likely causes application failure,
230	 * so fall back to the bottom-up function here. This scenario
231	 * can happen with large stack limits and large mmap()
232	 * allocations.
233	 */
234	if (unlikely(offset_in_page(addr))) {
235		VM_BUG_ON(addr != -ENOMEM);
236		info.flags = 0;
237		info.low_limit = current->mm->mmap_base;
238		info.high_limit = arch_get_mmap_end(addr, len, flags);
239		addr = vm_unmapped_area(&info);
240	}
241
242	return addr;
243}
244
245unsigned long
246generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
247				  unsigned long len, unsigned long pgoff,
248				  unsigned long flags)
249{
250	struct mm_struct *mm = current->mm;
251	struct vm_area_struct *vma;
252	struct hstate *h = hstate_file(file);
253	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
254
255	if (len & ~huge_page_mask(h))
256		return -EINVAL;
257	if (len > TASK_SIZE)
258		return -ENOMEM;
259
260	if (flags & MAP_FIXED) {
261		if (prepare_hugepage_range(file, addr, len))
262			return -EINVAL;
263		return addr;
264	}
265
266	if (addr) {
267		addr = ALIGN(addr, huge_page_size(h));
268		vma = find_vma(mm, addr);
269		if (mmap_end - len >= addr &&
270		    (!vma || addr + len <= vm_start_gap(vma)))
271			return addr;
272	}
273
274	/*
275	 * Use the mm->get_unmapped_area value as a hint to decide whether to use
276	 * the topdown routine.  If architectures have special needs, they should
277	 * define their own version of hugetlb_get_unmapped_area.
278	 */
279	if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
280		return hugetlb_get_unmapped_area_topdown(file, addr, len,
281				pgoff, flags);
282	return hugetlb_get_unmapped_area_bottomup(file, addr, len,
283			pgoff, flags);
284}
285
286#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
287static unsigned long
288hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
289			  unsigned long len, unsigned long pgoff,
290			  unsigned long flags)
291{
292	return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
293}
294#endif
295
296/*
297 * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
298 * Returns the maximum number of bytes one can read without touching the 1st raw
299 * Returns the maximum number of bytes one can read without touching the
300 * first raw HWPOISON subpage.
301 * The implementation borrows the iteration logic from copy_page_to_iter*.
302 */
303static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
304{
305	size_t n = 0;
306	size_t res = 0;
307
308	/* First subpage to start the loop. */
309	page = nth_page(page, offset / PAGE_SIZE);
310	offset %= PAGE_SIZE;
311	while (1) {
312		if (is_raw_hwpoison_page_in_hugepage(page))
313			break;
314
315		/* Safe to read n bytes without touching HWPOISON subpage. */
316		n = min(bytes, (size_t)PAGE_SIZE - offset);
317		res += n;
318		bytes -= n;
319		if (!bytes || !n)
320			break;
321		offset += n;
322		if (offset == PAGE_SIZE) {
323			page = nth_page(page, 1);
324			offset = 0;
325		}
326	}
327
328	return res;
329}
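
/*
 * Worked example (illustrative): if the fourth subpage (index 3) of a huge
 * page is the first raw HWPOISON subpage, a read of the whole huge page from
 * offset 0 gets 3 * PAGE_SIZE back from adjust_range_hwpoison(), so only the
 * leading clean subpages are copied out.
 */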
330
331/*
332 * Support for read() - Find the page attached to f_mapping and copy out the
333 * data. This provides functionality similar to filemap_read().
334 */
335static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
336{
337	struct file *file = iocb->ki_filp;
338	struct hstate *h = hstate_file(file);
339	struct address_space *mapping = file->f_mapping;
340	struct inode *inode = mapping->host;
341	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
342	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
343	unsigned long end_index;
344	loff_t isize;
345	ssize_t retval = 0;
346
347	while (iov_iter_count(to)) {
348		struct page *page;
349		size_t nr, copied, want;
350
351		/* nr is the maximum number of bytes to copy from this page */
352		nr = huge_page_size(h);
353		isize = i_size_read(inode);
354		if (!isize)
355			break;
356		end_index = (isize - 1) >> huge_page_shift(h);
357		if (index > end_index)
358			break;
359		if (index == end_index) {
360			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
361			if (nr <= offset)
362				break;
363		}
364		nr = nr - offset;
365
366		/* Find the page */
367		page = find_lock_page(mapping, index);
368		if (unlikely(page == NULL)) {
369			/*
370			 * We have a HOLE, zero out the user-buffer for the
371			 * length of the hole or request.
372			 */
373			copied = iov_iter_zero(nr, to);
374		} else {
375			unlock_page(page);
376
377			if (!PageHWPoison(page))
378				want = nr;
379			else {
380				/*
381				 * Adjust how many bytes are safe to read
382				 * without touching the first raw HWPOISON
383				 * subpage after offset.
384				 */
385				want = adjust_range_hwpoison(page, offset, nr);
386				if (want == 0) {
387					put_page(page);
388					retval = -EIO;
389					break;
390				}
391			}
392
393			/*
394			 * We have the page, copy it to user space buffer.
395			 */
396			copied = copy_page_to_iter(page, offset, want, to);
397			put_page(page);
398		}
399		offset += copied;
400		retval += copied;
401		if (copied != nr && iov_iter_count(to)) {
402			if (!retval)
403				retval = -EFAULT;
404			break;
405		}
406		index += offset >> huge_page_shift(h);
407		offset &= ~huge_page_mask(h);
408	}
409	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
410	return retval;
411}
412
413static int hugetlbfs_write_begin(struct file *file,
414			struct address_space *mapping,
415			loff_t pos, unsigned len,
416			struct page **pagep, void **fsdata)
417{
418	return -EINVAL;
419}
420
421static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
422			loff_t pos, unsigned len, unsigned copied,
423			struct page *page, void *fsdata)
424{
425	BUG();
426	return -EINVAL;
427}
428
429static void hugetlb_delete_from_page_cache(struct folio *folio)
430{
431	folio_clear_dirty(folio);
432	folio_clear_uptodate(folio);
433	filemap_remove_folio(folio);
434}
435
436/*
437 * Called with i_mmap_rwsem held for inode based vma maps.  This makes
438 * sure vma (and vm_mm) will not go away.  We also hold the hugetlb fault
439 * mutex for the page in the mapping.  So, we cannot race with the page
440 * being faulted into the vma.
441 */
442static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
443				unsigned long addr, struct page *page)
444{
445	pte_t *ptep, pte;
446
447	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
448	if (!ptep)
449		return false;
450
451	pte = huge_ptep_get(ptep);
452	if (huge_pte_none(pte) || !pte_present(pte))
453		return false;
454
455	if (pte_page(pte) == page)
456		return true;
457
458	return false;
459}
460
461/*
462 * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
463 * No, because the interval tree returns us only those vmas
464 * which overlap the truncated area starting at pgoff,
465 * and no vma on a 32-bit arch can span beyond 4GB.
466 */
467static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
468{
469	unsigned long offset = 0;
470
471	if (vma->vm_pgoff < start)
472		offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
473
474	return vma->vm_start + offset;
475}
476
477static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
478{
479	unsigned long t_end;
480
481	if (!end)
482		return vma->vm_end;
483
484	t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
485	if (t_end > vma->vm_end)
486		t_end = vma->vm_end;
487	return t_end;
488}
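
/*
 * Worked example (illustrative, assuming PAGE_SHIFT == 12): for a vma with
 * vm_start == 0x400000 and vm_pgoff == 0x200, vma_offset_start(vma, 0x300)
 * returns 0x400000 + (0x100 << 12) == 0x500000, while any start at or before
 * pgoff 0x200 simply returns vm_start.
 */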
489
490/*
491 * Called with hugetlb fault mutex held.  Therefore, no more mappings to
492 * this folio can be created while executing the routine.
493 */
494static void hugetlb_unmap_file_folio(struct hstate *h,
495					struct address_space *mapping,
496					struct folio *folio, pgoff_t index)
497{
498	struct rb_root_cached *root = &mapping->i_mmap;
499	struct hugetlb_vma_lock *vma_lock;
500	struct page *page = &folio->page;
501	struct vm_area_struct *vma;
502	unsigned long v_start;
503	unsigned long v_end;
504	pgoff_t start, end;
505
506	start = index * pages_per_huge_page(h);
507	end = (index + 1) * pages_per_huge_page(h);
508
509	i_mmap_lock_write(mapping);
510retry:
511	vma_lock = NULL;
512	vma_interval_tree_foreach(vma, root, start, end - 1) {
513		v_start = vma_offset_start(vma, start);
514		v_end = vma_offset_end(vma, end);
515
516		if (!hugetlb_vma_maps_page(vma, v_start, page))
517			continue;
518
519		if (!hugetlb_vma_trylock_write(vma)) {
520			vma_lock = vma->vm_private_data;
521			/*
522			 * If we cannot get the vma lock, we need to drop
523			 * i_mmap_rwsem and take the locks in order.  First,
524			 * take a ref on the vma_lock structure so that
525			 * we can be guaranteed it will not go away when
526			 * dropping i_mmap_rwsem.
527			 */
528			kref_get(&vma_lock->refs);
529			break;
530		}
531
532		unmap_hugepage_range(vma, v_start, v_end, NULL,
533				     ZAP_FLAG_DROP_MARKER);
534		hugetlb_vma_unlock_write(vma);
535	}
536
537	i_mmap_unlock_write(mapping);
538
539	if (vma_lock) {
540		/*
541		 * Wait on vma_lock.  We know it is still valid as we have
542		 * a reference.  We must 'open code' vma locking as we do
543		 * not know if vma_lock is still attached to vma.
544		 */
545		down_write(&vma_lock->rw_sema);
546		i_mmap_lock_write(mapping);
547
548		vma = vma_lock->vma;
549		if (!vma) {
550			/*
551			 * If lock is no longer attached to vma, then just
552			 * unlock, drop our reference and retry looking for
553			 * other vmas.
554			 */
555			up_write(&vma_lock->rw_sema);
556			kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
557			goto retry;
558		}
559
560		/*
561		 * vma_lock is still attached to vma.  Check to see if vma
562		 * still maps page and if so, unmap.
563		 */
564		v_start = vma_offset_start(vma, start);
565		v_end = vma_offset_end(vma, end);
566		if (hugetlb_vma_maps_page(vma, v_start, page))
567			unmap_hugepage_range(vma, v_start, v_end, NULL,
568					     ZAP_FLAG_DROP_MARKER);
569
570		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
571		hugetlb_vma_unlock_write(vma);
572
573		goto retry;
574	}
575}
576
577static void
578hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
579		      zap_flags_t zap_flags)
580{
581	struct vm_area_struct *vma;
582
583	/*
584	 * end == 0 indicates that the entire range after start should be
585	 * unmapped.  Note, end is exclusive, whereas the interval tree takes
586	 * an inclusive "last".
587	 */
588	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
589		unsigned long v_start;
590		unsigned long v_end;
591
592		if (!hugetlb_vma_trylock_write(vma))
593			continue;
594
595		v_start = vma_offset_start(vma, start);
596		v_end = vma_offset_end(vma, end);
597
598		unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);
599
600		/*
601		 * Note that the vma lock only exists for shared/non-private
602		 * vmas.  Therefore, the lock is not held when calling
603		 * unmap_hugepage_range for private vmas.
604		 */
605		hugetlb_vma_unlock_write(vma);
606	}
607}
608
609/*
610 * Called with hugetlb fault mutex held.
611 * Returns true if page was actually removed, false otherwise.
612 */
613static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
614					struct address_space *mapping,
615					struct folio *folio, pgoff_t index,
616					bool truncate_op)
617{
618	bool ret = false;
619
620	/*
621	 * If folio is mapped, it was faulted in after being
622	 * unmapped in caller.  Unmap (again) while holding
623	 * the fault mutex.  The mutex will prevent faults
624	 * until we finish removing the folio.
625	 */
626	if (unlikely(folio_mapped(folio)))
627		hugetlb_unmap_file_folio(h, mapping, folio, index);
628
629	folio_lock(folio);
630	/*
631	 * We must remove the folio from page cache before removing
632		 * the region/reserve map (hugetlb_unreserve_pages).  In
633		 * rare out-of-memory conditions, removal of the region/reserve
634		 * map could fail.  Correspondingly, the subpool and global
635		 * reserve usage counts may then need to be adjusted.
636	 */
637	VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
638	hugetlb_delete_from_page_cache(folio);
639	ret = true;
640	if (!truncate_op) {
641		if (unlikely(hugetlb_unreserve_pages(inode, index,
642							index + 1, 1)))
643			hugetlb_fix_reserve_counts(inode);
644	}
645
646	folio_unlock(folio);
647	return ret;
648}
649
650/*
651 * remove_inode_hugepages handles two distinct cases: truncation and hole
652 * punch.  There are subtle differences in operation for each case.
653 *
654 * truncation is indicated by end of range being LLONG_MAX
655 *	In this case, we first scan the range and release found pages.
656 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
657 *	maps and global counts.  Page faults can race with truncation.
658 *	During faults, hugetlb_no_page() checks i_size before page allocation,
659 *	and again after obtaining page table lock.  It will 'back out'
660 *	allocations in the truncated range.
661 * hole punch is indicated if end is not LLONG_MAX
662 *	In the hole punch case we scan the range and release found pages.
663 *	Only when releasing a page is the associated region/reserve map
664 *	deleted.  Region/reserve map entries for ranges without associated
665 *	pages are not modified.  Page faults can race with hole punch.
666 *	This is indicated if we find a mapped page.
667 * Note: If the passed end of range value is beyond the end of file, but
668 * not LLONG_MAX this routine still performs a hole punch operation.
669 */
670static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
671				   loff_t lend)
672{
673	struct hstate *h = hstate_inode(inode);
674	struct address_space *mapping = &inode->i_data;
675	const pgoff_t start = lstart >> huge_page_shift(h);
676	const pgoff_t end = lend >> huge_page_shift(h);
677	struct folio_batch fbatch;
678	pgoff_t next, index;
679	int i, freed = 0;
680	bool truncate_op = (lend == LLONG_MAX);
681
682	folio_batch_init(&fbatch);
683	next = start;
684	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
685		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
686			struct folio *folio = fbatch.folios[i];
687			u32 hash = 0;
688
689			index = folio->index;
690			hash = hugetlb_fault_mutex_hash(mapping, index);
691			mutex_lock(&hugetlb_fault_mutex_table[hash]);
692
693			/*
694			 * Remove folio that was part of folio_batch.
695			 */
696			if (remove_inode_single_folio(h, inode, mapping, folio,
697							index, truncate_op))
698				freed++;
699
700			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
701		}
702		folio_batch_release(&fbatch);
703		cond_resched();
704	}
705
706	if (truncate_op)
707		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
708}
709
710static void hugetlbfs_evict_inode(struct inode *inode)
711{
712	struct resv_map *resv_map;
713
714	remove_inode_hugepages(inode, 0, LLONG_MAX);
715
716	/*
717	 * Get the resv_map from the address space embedded in the inode.
718	 * This is the address space which points to any resv_map allocated
719	 * at inode creation time.  If this is a device special inode,
720	 * i_mapping may not point to the original address space.
721	 */
722	resv_map = (struct resv_map *)(&inode->i_data)->private_data;
723	/* Only regular and link inodes have associated reserve maps */
724	if (resv_map)
725		resv_map_release(&resv_map->refs);
726	clear_inode(inode);
727}
728
729static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
730{
731	pgoff_t pgoff;
732	struct address_space *mapping = inode->i_mapping;
733	struct hstate *h = hstate_inode(inode);
734
735	BUG_ON(offset & ~huge_page_mask(h));
736	pgoff = offset >> PAGE_SHIFT;
737
738	i_size_write(inode, offset);
739	i_mmap_lock_write(mapping);
740	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
741		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
742				      ZAP_FLAG_DROP_MARKER);
743	i_mmap_unlock_write(mapping);
744	remove_inode_hugepages(inode, offset, LLONG_MAX);
745}
746
747static void hugetlbfs_zero_partial_page(struct hstate *h,
748					struct address_space *mapping,
749					loff_t start,
750					loff_t end)
751{
752	pgoff_t idx = start >> huge_page_shift(h);
753	struct folio *folio;
754
755	folio = filemap_lock_folio(mapping, idx);
756	if (IS_ERR(folio))
757		return;
758
759	start = start & ~huge_page_mask(h);
760	end = end & ~huge_page_mask(h);
761	if (!end)
762		end = huge_page_size(h);
763
764	folio_zero_segment(folio, (size_t)start, (size_t)end);
765
766	folio_unlock(folio);
767	folio_put(folio);
768}
769
770static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
771{
772	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
773	struct address_space *mapping = inode->i_mapping;
774	struct hstate *h = hstate_inode(inode);
775	loff_t hpage_size = huge_page_size(h);
776	loff_t hole_start, hole_end;
777
778	/*
779	 * hole_start and hole_end indicate the full pages within the hole.
780	 */
781	hole_start = round_up(offset, hpage_size);
782	hole_end = round_down(offset + len, hpage_size);
783
784	inode_lock(inode);
785
786	/* protected by i_rwsem */
787	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
788		inode_unlock(inode);
789		return -EPERM;
790	}
791
792	i_mmap_lock_write(mapping);
793
794	/* If range starts before first full page, zero partial page. */
795	if (offset < hole_start)
796		hugetlbfs_zero_partial_page(h, mapping,
797				offset, min(offset + len, hole_start));
798
799	/* Unmap users of full pages in the hole. */
800	if (hole_end > hole_start) {
801		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
802			hugetlb_vmdelete_list(&mapping->i_mmap,
803					      hole_start >> PAGE_SHIFT,
804					      hole_end >> PAGE_SHIFT, 0);
805	}
806
807	/* If range extends beyond last full page, zero partial page. */
808	if ((offset + len) > hole_end && (offset + len) > hole_start)
809		hugetlbfs_zero_partial_page(h, mapping,
810				hole_end, offset + len);
811
812	i_mmap_unlock_write(mapping);
813
814	/* Remove full pages from the file. */
815	if (hole_end > hole_start)
816		remove_inode_hugepages(inode, hole_start, hole_end);
817
818	inode_unlock(inode);
819
820	return 0;
821}
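
/*
 * Illustrative userspace usage (not part of the original source): punching a
 * hole in a hugetlbfs file with
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
 *
 * ends up in hugetlbfs_punch_hole() above.  Partial huge pages at either edge
 * of the range are only zeroed in place; whole huge pages are removed from
 * the file.
 */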
822
823static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
824				loff_t len)
825{
826	struct inode *inode = file_inode(file);
827	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
828	struct address_space *mapping = inode->i_mapping;
829	struct hstate *h = hstate_inode(inode);
830	struct vm_area_struct pseudo_vma;
831	struct mm_struct *mm = current->mm;
832	loff_t hpage_size = huge_page_size(h);
833	unsigned long hpage_shift = huge_page_shift(h);
834	pgoff_t start, index, end;
835	int error;
836	u32 hash;
837
838	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
839		return -EOPNOTSUPP;
840
841	if (mode & FALLOC_FL_PUNCH_HOLE)
842		return hugetlbfs_punch_hole(inode, offset, len);
843
844	/*
845	 * Default preallocate case.
846	 * For this range, start is rounded down and end is rounded up
847	 * as well as being converted to page offsets.
848	 */
849	start = offset >> hpage_shift;
850	end = (offset + len + hpage_size - 1) >> hpage_shift;
851
852	inode_lock(inode);
853
854	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
855	error = inode_newsize_ok(inode, offset + len);
856	if (error)
857		goto out;
858
859	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
860		error = -EPERM;
861		goto out;
862	}
863
864	/*
865	 * Initialize a pseudo vma as this is required by the huge page
866	 * allocation routines.  If NUMA is configured, use page index
867	 * as input to create an allocation policy.
868	 */
869	vma_init(&pseudo_vma, mm);
870	vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
871	pseudo_vma.vm_file = file;
872
873	for (index = start; index < end; index++) {
874		/*
875		 * This is supposed to be the vaddr where the page is being
876		 * faulted in, but we have no vaddr here.
877		 */
878		struct folio *folio;
879		unsigned long addr;
880
881		cond_resched();
882
883		/*
884		 * fallocate(2) manpage permits EINTR; we may have been
885		 * interrupted because we are using up too much memory.
886		 */
887		if (signal_pending(current)) {
888			error = -EINTR;
889			break;
890		}
891
892		/* addr is the offset within the file (zero based) */
893		addr = index * hpage_size;
894
895		/* mutex taken here, fault path and hole punch */
896		hash = hugetlb_fault_mutex_hash(mapping, index);
897		mutex_lock(&hugetlb_fault_mutex_table[hash]);
898
899		/* See if already present in mapping to avoid alloc/free */
900		folio = filemap_get_folio(mapping, index);
901		if (!IS_ERR(folio)) {
902			folio_put(folio);
903			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
904			continue;
905		}
906
907		/*
908		 * Allocate folio without setting the avoid_reserve argument.
909		 * There certainly are no reserves associated with the
910		 * pseudo_vma.  However, there could be shared mappings with
911		 * reserves for the file at the inode level.  If we fallocate
912		 * folios in these areas, we need to consume the reserves
913		 * to keep reservation accounting consistent.
914		 */
915		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
916		folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
917		hugetlb_drop_vma_policy(&pseudo_vma);
918		if (IS_ERR(folio)) {
919			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
920			error = PTR_ERR(folio);
921			goto out;
922		}
923		clear_huge_page(&folio->page, addr, pages_per_huge_page(h));
924		__folio_mark_uptodate(folio);
925		error = hugetlb_add_to_page_cache(folio, mapping, index);
926		if (unlikely(error)) {
927			restore_reserve_on_error(h, &pseudo_vma, addr, folio);
928			folio_put(folio);
929			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
930			goto out;
931		}
932
933		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
934
935		folio_set_hugetlb_migratable(folio);
936		/*
937		 * folio_unlock because the folio was locked by hugetlb_add_to_page_cache();
938		 * folio_put drops the reference taken by alloc_hugetlb_folio().
939		 */
940		folio_unlock(folio);
941		folio_put(folio);
942	}
943
944	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
945		i_size_write(inode, offset + len);
946	inode_set_ctime_current(inode);
947out:
948	inode_unlock(inode);
949	return error;
950}
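
/*
 * Illustrative userspace usage (not part of the original source): a plain
 *
 *	fallocate(fd, 0, 0, 1UL << 30);
 *
 * preallocates the huge pages backing the first 1GB of the file (rounded to
 * huge page boundaries) and adds them to the page cache up front, instead of
 * allocating them at fault time.
 */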
951
952static int hugetlbfs_setattr(struct mnt_idmap *idmap,
953			     struct dentry *dentry, struct iattr *attr)
954{
955	struct inode *inode = d_inode(dentry);
956	struct hstate *h = hstate_inode(inode);
957	int error;
958	unsigned int ia_valid = attr->ia_valid;
959	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
960
961	error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
962	if (error)
963		return error;
964
965	if (ia_valid & ATTR_SIZE) {
966		loff_t oldsize = inode->i_size;
967		loff_t newsize = attr->ia_size;
968
969		if (newsize & ~huge_page_mask(h))
970			return -EINVAL;
971		/* protected by i_rwsem */
972		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
973		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
974			return -EPERM;
975		hugetlb_vmtruncate(inode, newsize);
976	}
977
978	setattr_copy(&nop_mnt_idmap, inode, attr);
979	mark_inode_dirty(inode);
980	return 0;
981}
982
983static struct inode *hugetlbfs_get_root(struct super_block *sb,
984					struct hugetlbfs_fs_context *ctx)
985{
986	struct inode *inode;
987
988	inode = new_inode(sb);
989	if (inode) {
990		inode->i_ino = get_next_ino();
991		inode->i_mode = S_IFDIR | ctx->mode;
992		inode->i_uid = ctx->uid;
993		inode->i_gid = ctx->gid;
994		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
995		inode->i_op = &hugetlbfs_dir_inode_operations;
996		inode->i_fop = &simple_dir_operations;
997		/* directory inodes start off with i_nlink == 2 (for "." entry) */
998		inc_nlink(inode);
999		lockdep_annotate_inode_mutex_key(inode);
1000	}
1001	return inode;
1002}
1003
1004/*
1005 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
1006 * be taken from reclaim -- unlike regular filesystems. This needs an
1007 * annotation because huge_pmd_share() does an allocation under hugetlb's
1008 * i_mmap_rwsem.
1009 */
1010static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
1011
1012static struct inode *hugetlbfs_get_inode(struct super_block *sb,
1013					struct inode *dir,
1014					umode_t mode, dev_t dev)
1015{
1016	struct inode *inode;
1017	struct resv_map *resv_map = NULL;
1018
1019	/*
1020	 * Reserve maps are only needed for inodes that can have associated
1021	 * page allocations.
1022	 */
1023	if (S_ISREG(mode) || S_ISLNK(mode)) {
1024		resv_map = resv_map_alloc();
1025		if (!resv_map)
1026			return NULL;
1027	}
1028
1029	inode = new_inode(sb);
1030	if (inode) {
1031		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
1032
1033		inode->i_ino = get_next_ino();
1034		inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
1035		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
1036				&hugetlbfs_i_mmap_rwsem_key);
1037		inode->i_mapping->a_ops = &hugetlbfs_aops;
1038		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
1039		inode->i_mapping->private_data = resv_map;
1040		info->seals = F_SEAL_SEAL;
1041		switch (mode & S_IFMT) {
1042		default:
1043			init_special_inode(inode, mode, dev);
1044			break;
1045		case S_IFREG:
1046			inode->i_op = &hugetlbfs_inode_operations;
1047			inode->i_fop = &hugetlbfs_file_operations;
1048			break;
1049		case S_IFDIR:
1050			inode->i_op = &hugetlbfs_dir_inode_operations;
1051			inode->i_fop = &simple_dir_operations;
1052
1053			/* directory inodes start off with i_nlink == 2 (for "." entry) */
1054			inc_nlink(inode);
1055			break;
1056		case S_IFLNK:
1057			inode->i_op = &page_symlink_inode_operations;
1058			inode_nohighmem(inode);
1059			break;
1060		}
1061		lockdep_annotate_inode_mutex_key(inode);
1062	} else {
1063		if (resv_map)
1064			kref_put(&resv_map->refs, resv_map_release);
1065	}
1066
1067	return inode;
1068}
1069
1070/*
1071 * File creation.  Allocate an inode, and we're done.
1072 */
1073static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
1074			   struct dentry *dentry, umode_t mode, dev_t dev)
1075{
1076	struct inode *inode;
1077
1078	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
1079	if (!inode)
1080		return -ENOSPC;
1081	dir->i_mtime = inode_set_ctime_current(dir);
1082	d_instantiate(dentry, inode);
1083	dget(dentry);/* Extra count - pin the dentry in core */
1084	return 0;
1085}
1086
1087static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
1088			   struct dentry *dentry, umode_t mode)
1089{
1090	int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry,
1091				     mode | S_IFDIR, 0);
1092	if (!retval)
1093		inc_nlink(dir);
1094	return retval;
1095}
1096
1097static int hugetlbfs_create(struct mnt_idmap *idmap,
1098			    struct inode *dir, struct dentry *dentry,
1099			    umode_t mode, bool excl)
1100{
1101	return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);
1102}
1103
1104static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
1105			     struct inode *dir, struct file *file,
1106			     umode_t mode)
1107{
1108	struct inode *inode;
1109
1110	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
1111	if (!inode)
1112		return -ENOSPC;
1113	dir->i_mtime = inode_set_ctime_current(dir);
1114	d_tmpfile(file, inode);
1115	return finish_open_simple(file, 0);
1116}
1117
1118static int hugetlbfs_symlink(struct mnt_idmap *idmap,
1119			     struct inode *dir, struct dentry *dentry,
1120			     const char *symname)
1121{
1122	struct inode *inode;
1123	int error = -ENOSPC;
1124
1125	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
1126	if (inode) {
1127		int l = strlen(symname)+1;
1128		error = page_symlink(inode, symname, l);
1129		if (!error) {
1130			d_instantiate(dentry, inode);
1131			dget(dentry);
1132		} else
1133			iput(inode);
1134	}
1135	dir->i_mtime = inode_set_ctime_current(dir);
1136
1137	return error;
1138}
1139
1140#ifdef CONFIG_MIGRATION
1141static int hugetlbfs_migrate_folio(struct address_space *mapping,
1142				struct folio *dst, struct folio *src,
1143				enum migrate_mode mode)
1144{
1145	int rc;
1146
1147	rc = migrate_huge_page_move_mapping(mapping, dst, src);
1148	if (rc != MIGRATEPAGE_SUCCESS)
1149		return rc;
1150
1151	if (hugetlb_folio_subpool(src)) {
1152		hugetlb_set_folio_subpool(dst,
1153					hugetlb_folio_subpool(src));
1154		hugetlb_set_folio_subpool(src, NULL);
1155	}
1156
1157	if (mode != MIGRATE_SYNC_NO_COPY)
1158		folio_migrate_copy(dst, src);
1159	else
1160		folio_migrate_flags(dst, src);
1161
1162	return MIGRATEPAGE_SUCCESS;
1163}
1164#else
1165#define hugetlbfs_migrate_folio NULL
1166#endif
1167
1168static int hugetlbfs_error_remove_page(struct address_space *mapping,
1169				struct page *page)
1170{
1171	return 0;
1172}
1173
1174/*
1175 * Display the mount options in /proc/mounts.
1176 */
1177static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
1178{
1179	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
1180	struct hugepage_subpool *spool = sbinfo->spool;
1181	unsigned long hpage_size = huge_page_size(sbinfo->hstate);
1182	unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
1183	char mod;
1184
1185	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
1186		seq_printf(m, ",uid=%u",
1187			   from_kuid_munged(&init_user_ns, sbinfo->uid));
1188	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
1189		seq_printf(m, ",gid=%u",
1190			   from_kgid_munged(&init_user_ns, sbinfo->gid));
1191	if (sbinfo->mode != 0755)
1192		seq_printf(m, ",mode=%o", sbinfo->mode);
1193	if (sbinfo->max_inodes != -1)
1194		seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);
1195
1196	hpage_size /= 1024;
1197	mod = 'K';
1198	if (hpage_size >= 1024) {
1199		hpage_size /= 1024;
1200		mod = 'M';
1201	}
1202	seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
1203	if (spool) {
1204		if (spool->max_hpages != -1)
1205			seq_printf(m, ",size=%llu",
1206				   (unsigned long long)spool->max_hpages << hpage_shift);
1207		if (spool->min_hpages != -1)
1208			seq_printf(m, ",min_size=%llu",
1209				   (unsigned long long)spool->min_hpages << hpage_shift);
1210	}
1211	return 0;
1212}
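
/*
 * Illustrative only: for a mount with 2MB pages and "size=1G" the resulting
 * /proc/mounts entry looks roughly like
 *
 *	none /mnt/huge hugetlbfs rw,relatime,pagesize=2M,size=1073741824 0 0
 *
 * (mount point and generic flags are examples; uid, gid, mode and nr_inodes
 * are only shown when they differ from the defaults).
 */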
1213
1214static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1215{
1216	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
1217	struct hstate *h = hstate_inode(d_inode(dentry));
1218
1219	buf->f_type = HUGETLBFS_MAGIC;
1220	buf->f_bsize = huge_page_size(h);
1221	if (sbinfo) {
1222		spin_lock(&sbinfo->stat_lock);
1223		/* If no limits set, just report 0 or -1 for max/free/used
1224		 * blocks, like simple_statfs() */
1225		if (sbinfo->spool) {
1226			long free_pages;
1227
1228			spin_lock_irq(&sbinfo->spool->lock);
1229			buf->f_blocks = sbinfo->spool->max_hpages;
1230			free_pages = sbinfo->spool->max_hpages
1231				- sbinfo->spool->used_hpages;
1232			buf->f_bavail = buf->f_bfree = free_pages;
1233			spin_unlock_irq(&sbinfo->spool->lock);
1234			buf->f_files = sbinfo->max_inodes;
1235			buf->f_ffree = sbinfo->free_inodes;
1236		}
1237		spin_unlock(&sbinfo->stat_lock);
1238	}
1239	buf->f_namelen = NAME_MAX;
1240	return 0;
1241}
1242
1243static void hugetlbfs_put_super(struct super_block *sb)
1244{
1245	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
1246
1247	if (sbi) {
1248		sb->s_fs_info = NULL;
1249
1250		if (sbi->spool)
1251			hugepage_put_subpool(sbi->spool);
1252
1253		kfree(sbi);
1254	}
1255}
1256
1257static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
1258{
1259	if (sbinfo->free_inodes >= 0) {
1260		spin_lock(&sbinfo->stat_lock);
1261		if (unlikely(!sbinfo->free_inodes)) {
1262			spin_unlock(&sbinfo->stat_lock);
1263			return 0;
1264		}
1265		sbinfo->free_inodes--;
1266		spin_unlock(&sbinfo->stat_lock);
1267	}
1268
1269	return 1;
1270}
1271
1272static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
1273{
1274	if (sbinfo->free_inodes >= 0) {
1275		spin_lock(&sbinfo->stat_lock);
1276		sbinfo->free_inodes++;
1277		spin_unlock(&sbinfo->stat_lock);
1278	}
1279}
1280
1281
1282static struct kmem_cache *hugetlbfs_inode_cachep;
1283
1284static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
1285{
1286	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
1287	struct hugetlbfs_inode_info *p;
1288
1289	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
1290		return NULL;
1291	p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
1292	if (unlikely(!p)) {
1293		hugetlbfs_inc_free_inodes(sbinfo);
1294		return NULL;
1295	}
1296
1297	/*
1298	 * Any time after allocation, hugetlbfs_destroy_inode can be called
1299	 * for the inode.  mpol_free_shared_policy is unconditionally called
1300	 * as part of hugetlbfs_destroy_inode.  So, initialize policy here
1301	 * in case of a quick call to destroy.
1302	 *
1303	 * Note that the policy is initialized even if we are creating a
1304	 * private inode.  This simplifies hugetlbfs_destroy_inode.
1305	 */
1306	mpol_shared_policy_init(&p->policy, NULL);
1307
1308	return &p->vfs_inode;
1309}
1310
1311static void hugetlbfs_free_inode(struct inode *inode)
1312{
1313	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
1314}
1315
1316static void hugetlbfs_destroy_inode(struct inode *inode)
1317{
1318	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
1319	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
1320}
1321
1322static const struct address_space_operations hugetlbfs_aops = {
1323	.write_begin	= hugetlbfs_write_begin,
1324	.write_end	= hugetlbfs_write_end,
1325	.dirty_folio	= noop_dirty_folio,
1326	.migrate_folio  = hugetlbfs_migrate_folio,
1327	.error_remove_page	= hugetlbfs_error_remove_page,
1328};
1329
1330
1331static void init_once(void *foo)
1332{
1333	struct hugetlbfs_inode_info *ei = foo;
1334
1335	inode_init_once(&ei->vfs_inode);
1336}
1337
1338const struct file_operations hugetlbfs_file_operations = {
1339	.read_iter		= hugetlbfs_read_iter,
1340	.mmap			= hugetlbfs_file_mmap,
1341	.fsync			= noop_fsync,
1342	.get_unmapped_area	= hugetlb_get_unmapped_area,
1343	.llseek			= default_llseek,
1344	.fallocate		= hugetlbfs_fallocate,
1345};
1346
1347static const struct inode_operations hugetlbfs_dir_inode_operations = {
1348	.create		= hugetlbfs_create,
1349	.lookup		= simple_lookup,
1350	.link		= simple_link,
1351	.unlink		= simple_unlink,
1352	.symlink	= hugetlbfs_symlink,
1353	.mkdir		= hugetlbfs_mkdir,
1354	.rmdir		= simple_rmdir,
1355	.mknod		= hugetlbfs_mknod,
1356	.rename		= simple_rename,
1357	.setattr	= hugetlbfs_setattr,
1358	.tmpfile	= hugetlbfs_tmpfile,
1359};
1360
1361static const struct inode_operations hugetlbfs_inode_operations = {
1362	.setattr	= hugetlbfs_setattr,
1363};
1364
1365static const struct super_operations hugetlbfs_ops = {
1366	.alloc_inode    = hugetlbfs_alloc_inode,
1367	.free_inode     = hugetlbfs_free_inode,
1368	.destroy_inode  = hugetlbfs_destroy_inode,
1369	.evict_inode	= hugetlbfs_evict_inode,
1370	.statfs		= hugetlbfs_statfs,
1371	.put_super	= hugetlbfs_put_super,
1372	.show_options	= hugetlbfs_show_options,
1373};
1374
1375/*
1376 * Convert size option passed from command line to number of huge pages
1377 * in the pool specified by hstate.  Size option could be in bytes
1378 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
1379 */
1380static long
1381hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
1382			 enum hugetlbfs_size_type val_type)
1383{
1384	if (val_type == NO_SIZE)
1385		return -1;
1386
1387	if (val_type == SIZE_PERCENT) {
1388		size_opt <<= huge_page_shift(h);
1389		size_opt *= h->max_huge_pages;
1390		do_div(size_opt, 100);
1391	}
1392
1393	size_opt >>= huge_page_shift(h);
1394	return size_opt;
1395}
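
/*
 * Worked example (illustrative): with 2MB huge pages and a pool of 1024
 * pages, "size=512M" (SIZE_STD) converts to 256 huge pages, while "size=50%"
 * (SIZE_PERCENT) converts to 512 huge pages.
 */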
1396
1397/*
1398 * Parse one mount parameter.
1399 */
1400static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
1401{
1402	struct hugetlbfs_fs_context *ctx = fc->fs_private;
1403	struct fs_parse_result result;
1404	struct hstate *h;
1405	char *rest;
1406	unsigned long ps;
1407	int opt;
1408
1409	opt = fs_parse(fc, hugetlb_fs_parameters, param, &result);
1410	if (opt < 0)
1411		return opt;
1412
1413	switch (opt) {
1414	case Opt_uid:
1415		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
1416		if (!uid_valid(ctx->uid))
1417			goto bad_val;
1418		return 0;
1419
1420	case Opt_gid:
1421		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
1422		if (!gid_valid(ctx->gid))
1423			goto bad_val;
1424		return 0;
1425
1426	case Opt_mode:
1427		ctx->mode = result.uint_32 & 01777U;
1428		return 0;
1429
1430	case Opt_size:
1431		/* memparse() will accept a K/M/G without a digit */
1432		if (!param->string || !isdigit(param->string[0]))
1433			goto bad_val;
1434		ctx->max_size_opt = memparse(param->string, &rest);
1435		ctx->max_val_type = SIZE_STD;
1436		if (*rest == '%')
1437			ctx->max_val_type = SIZE_PERCENT;
1438		return 0;
1439
1440	case Opt_nr_inodes:
1441		/* memparse() will accept a K/M/G without a digit */
1442		if (!param->string || !isdigit(param->string[0]))
1443			goto bad_val;
1444		ctx->nr_inodes = memparse(param->string, &rest);
1445		return 0;
1446
1447	case Opt_pagesize:
1448		ps = memparse(param->string, &rest);
1449		h = size_to_hstate(ps);
1450		if (!h) {
1451			pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
1452			return -EINVAL;
1453		}
1454		ctx->hstate = h;
1455		return 0;
1456
1457	case Opt_min_size:
1458		/* memparse() will accept a K/M/G without a digit */
1459		if (!param->string || !isdigit(param->string[0]))
1460			goto bad_val;
1461		ctx->min_size_opt = memparse(param->string, &rest);
1462		ctx->min_val_type = SIZE_STD;
1463		if (*rest == '%')
1464			ctx->min_val_type = SIZE_PERCENT;
1465		return 0;
1466
1467	default:
1468		return -EINVAL;
1469	}
1470
1471bad_val:
1472	return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
1473		      param->string, param->key);
1474}
1475
1476/*
1477 * Validate the parsed options.
1478 */
1479static int hugetlbfs_validate(struct fs_context *fc)
1480{
1481	struct hugetlbfs_fs_context *ctx = fc->fs_private;
1482
1483	/*
1484	 * Use huge page pool size (in hstate) to convert the size
1485	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
1486	 */
1487	ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
1488						   ctx->max_size_opt,
1489						   ctx->max_val_type);
1490	ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
1491						   ctx->min_size_opt,
1492						   ctx->min_val_type);
1493
1494	/*
1495	 * If max_size was specified, then min_size must not be larger.
1496	 */
1497	if (ctx->max_val_type > NO_SIZE &&
1498	    ctx->min_hpages > ctx->max_hpages) {
1499		pr_err("Minimum size can not be greater than maximum size\n");
1500		return -EINVAL;
1501	}
1502
1503	return 0;
1504}
1505
1506static int
1507hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
1508{
1509	struct hugetlbfs_fs_context *ctx = fc->fs_private;
1510	struct hugetlbfs_sb_info *sbinfo;
1511
1512	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
1513	if (!sbinfo)
1514		return -ENOMEM;
1515	sb->s_fs_info = sbinfo;
1516	spin_lock_init(&sbinfo->stat_lock);
1517	sbinfo->hstate		= ctx->hstate;
1518	sbinfo->max_inodes	= ctx->nr_inodes;
1519	sbinfo->free_inodes	= ctx->nr_inodes;
1520	sbinfo->spool		= NULL;
1521	sbinfo->uid		= ctx->uid;
1522	sbinfo->gid		= ctx->gid;
1523	sbinfo->mode		= ctx->mode;
1524
1525	/*
1526	 * Allocate and initialize subpool if maximum or minimum size is
1527	 * specified.  Any needed reservations (for minimum size) are taken
1528	 * when the subpool is created.
1529	 */
1530	if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
1531		sbinfo->spool = hugepage_new_subpool(ctx->hstate,
1532						     ctx->max_hpages,
1533						     ctx->min_hpages);
1534		if (!sbinfo->spool)
1535			goto out_free;
1536	}
1537	sb->s_maxbytes = MAX_LFS_FILESIZE;
1538	sb->s_blocksize = huge_page_size(ctx->hstate);
1539	sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
1540	sb->s_magic = HUGETLBFS_MAGIC;
1541	sb->s_op = &hugetlbfs_ops;
1542	sb->s_time_gran = 1;
1543
1544	/*
1545	 * Due to the special and limited functionality of hugetlbfs, it does
1546	 * not work well as a stacking filesystem.
1547	 */
1548	sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
1549	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
1550	if (!sb->s_root)
1551		goto out_free;
1552	return 0;
1553out_free:
1554	kfree(sbinfo->spool);
1555	kfree(sbinfo);
1556	return -ENOMEM;
1557}
1558
1559static int hugetlbfs_get_tree(struct fs_context *fc)
1560{
1561	int err = hugetlbfs_validate(fc);
1562	if (err)
1563		return err;
1564	return get_tree_nodev(fc, hugetlbfs_fill_super);
1565}
1566
1567static void hugetlbfs_fs_context_free(struct fs_context *fc)
1568{
1569	kfree(fc->fs_private);
1570}
1571
1572static const struct fs_context_operations hugetlbfs_fs_context_ops = {
1573	.free		= hugetlbfs_fs_context_free,
1574	.parse_param	= hugetlbfs_parse_param,
1575	.get_tree	= hugetlbfs_get_tree,
1576};
1577
1578static int hugetlbfs_init_fs_context(struct fs_context *fc)
1579{
1580	struct hugetlbfs_fs_context *ctx;
1581
1582	ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
1583	if (!ctx)
1584		return -ENOMEM;
1585
1586	ctx->max_hpages	= -1; /* No limit on size by default */
1587	ctx->nr_inodes	= -1; /* No limit on number of inodes by default */
1588	ctx->uid	= current_fsuid();
1589	ctx->gid	= current_fsgid();
1590	ctx->mode	= 0755;
1591	ctx->hstate	= &default_hstate;
1592	ctx->min_hpages	= -1; /* No default minimum size */
1593	ctx->max_val_type = NO_SIZE;
1594	ctx->min_val_type = NO_SIZE;
1595	fc->fs_private = ctx;
1596	fc->ops	= &hugetlbfs_fs_context_ops;
1597	return 0;
1598}
1599
1600static struct file_system_type hugetlbfs_fs_type = {
1601	.name			= "hugetlbfs",
1602	.init_fs_context	= hugetlbfs_init_fs_context,
1603	.parameters		= hugetlb_fs_parameters,
1604	.kill_sb		= kill_litter_super,
1605};
1606
1607static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
1608
1609static int can_do_hugetlb_shm(void)
1610{
1611	kgid_t shm_group;
1612	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
1613	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
1614}
1615
1616static int get_hstate_idx(int page_size_log)
1617{
1618	struct hstate *h = hstate_sizelog(page_size_log);
1619
1620	if (!h)
1621		return -1;
1622	return hstate_index(h);
1623}
1624
1625/*
1626 * Note that size should be aligned to the proper hugepage size by the caller;
1627 * otherwise hugetlb_reserve_pages reserves one fewer hugepage than intended.
1628 */
1629struct file *hugetlb_file_setup(const char *name, size_t size,
1630				vm_flags_t acctflag, int creat_flags,
1631				int page_size_log)
1632{
1633	struct inode *inode;
1634	struct vfsmount *mnt;
1635	int hstate_idx;
1636	struct file *file;
1637
1638	hstate_idx = get_hstate_idx(page_size_log);
1639	if (hstate_idx < 0)
1640		return ERR_PTR(-ENODEV);
1641
1642	mnt = hugetlbfs_vfsmount[hstate_idx];
1643	if (!mnt)
1644		return ERR_PTR(-ENOENT);
1645
1646	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
1647		struct ucounts *ucounts = current_ucounts();
1648
1649		if (user_shm_lock(size, ucounts)) {
1650			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
1651				current->comm, current->pid);
1652			user_shm_unlock(size, ucounts);
1653		}
1654		return ERR_PTR(-EPERM);
1655	}
1656
1657	file = ERR_PTR(-ENOSPC);
1658	inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
1659	if (!inode)
1660		goto out;
1661	if (creat_flags == HUGETLB_SHMFS_INODE)
1662		inode->i_flags |= S_PRIVATE;
1663
1664	inode->i_size = size;
1665	clear_nlink(inode);
1666
1667	if (!hugetlb_reserve_pages(inode, 0,
1668			size >> huge_page_shift(hstate_inode(inode)), NULL,
1669			acctflag))
1670		file = ERR_PTR(-ENOMEM);
1671	else
1672		file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
1673					&hugetlbfs_file_operations);
1674	if (!IS_ERR(file))
1675		return file;
1676
1677	iput(inode);
1678out:
1679	return file;
1680}
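
/*
 * For orientation: hugetlb_file_setup() is called from the SHM_HUGETLB path
 * in ipc/shm.c and the MAP_HUGETLB path in mm/mmap.c (see those call sites
 * for details).
 */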
1681
1682static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
1683{
1684	struct fs_context *fc;
1685	struct vfsmount *mnt;
1686
1687	fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
1688	if (IS_ERR(fc)) {
1689		mnt = ERR_CAST(fc);
1690	} else {
1691		struct hugetlbfs_fs_context *ctx = fc->fs_private;
1692		ctx->hstate = h;
1693		mnt = fc_mount(fc);
1694		put_fs_context(fc);
1695	}
1696	if (IS_ERR(mnt))
1697		pr_err("Cannot mount internal hugetlbfs for page size %luK",
1698		       huge_page_size(h) / SZ_1K);
1699	return mnt;
1700}
1701
1702static int __init init_hugetlbfs_fs(void)
1703{
1704	struct vfsmount *mnt;
1705	struct hstate *h;
1706	int error;
1707	int i;
1708
1709	if (!hugepages_supported()) {
1710		pr_info("disabling because there are no supported hugepage sizes\n");
1711		return -ENOTSUPP;
1712	}
1713
1714	error = -ENOMEM;
1715	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1716					sizeof(struct hugetlbfs_inode_info),
1717					0, SLAB_ACCOUNT, init_once);
1718	if (hugetlbfs_inode_cachep == NULL)
1719		goto out;
1720
1721	error = register_filesystem(&hugetlbfs_fs_type);
1722	if (error)
1723		goto out_free;
1724
1725	/* default hstate mount is required */
1726	mnt = mount_one_hugetlbfs(&default_hstate);
1727	if (IS_ERR(mnt)) {
1728		error = PTR_ERR(mnt);
1729		goto out_unreg;
1730	}
1731	hugetlbfs_vfsmount[default_hstate_idx] = mnt;
1732
1733	/* other hstates are optional */
1734	i = 0;
1735	for_each_hstate(h) {
1736		if (i == default_hstate_idx) {
1737			i++;
1738			continue;
1739		}
1740
1741		mnt = mount_one_hugetlbfs(h);
1742		if (IS_ERR(mnt))
1743			hugetlbfs_vfsmount[i] = NULL;
1744		else
1745			hugetlbfs_vfsmount[i] = mnt;
1746		i++;
1747	}
1748
1749	return 0;
1750
1751 out_unreg:
1752	(void)unregister_filesystem(&hugetlbfs_fs_type);
1753 out_free:
1754	kmem_cache_destroy(hugetlbfs_inode_cachep);
1755 out:
1756	return error;
1757}
1758fs_initcall(init_hugetlbfs_fs)
1759