// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xfile mechanism uses an unlinked shmem file to
 * store our staging data.  This file is not installed in the file descriptor
 * table so that user programs cannot access the data, which means that the
 * xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file.  If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file.  These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */
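
/*
 * Example usage (a hypothetical sketch, not part of the original file): the
 * lifecycle of an xfile used to stage fixed-size records.  The record
 * structure and the short-I/O error policy are made up for illustration; the
 * xfile_* calls are the ones declared in scrub/xfile.h and defined below.
 */
static int __maybe_unused
example_xfile_lifecycle(void)
{
	struct example_rec {
		uint64_t	key;
		uint64_t	value;
	}			rec = { .key = 1, .value = 42 };
	struct xfile		*xf;
	ssize_t			ret;
	int			error;

	/* The description string only shows up in trace output. */
	error = xfile_create("example staging data", 0, &xf);
	if (error)
		return error;

	/* Stage one record; a short write is treated as an error here. */
	ret = xfile_pwrite(xf, &rec, sizeof(rec), 0);
	if (ret != sizeof(rec)) {
		error = ret < 0 ? ret : -EIO;
		goto out;
	}

	/* Read it back straight from the page cache. */
	ret = xfile_pread(xf, &rec, sizeof(rec), 0);
	if (ret != sizeof(rec))
		error = ret < 0 ? ret : -EIO;
out:
	/* No fd table entry exists, so this is the only way to free it. */
	xfile_destroy(xf);
	return error;
}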

/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Read a memory object directly from the xfile's page cache.  Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read.  Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole.  Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}
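
/*
 * Example (hypothetical sketch): an all-or-nothing load wrapper.  Like pread,
 * xfile_pread can return fewer bytes than requested, so callers that stage
 * fixed-size objects usually want to turn a short read into an error; the
 * error code chosen here is arbitrary (cf. the xfile_obj_* helpers in
 * scrub/xfile.h).
 */
static int __maybe_unused
example_load_obj(
	struct xfile	*xf,
	void		*buf,
	size_t		count,
	loff_t		pos)
{
	ssize_t		ret = xfile_pread(xf, buf, count, pos);

	if (ret < 0)
		return ret;
	if (ret != count)
		return -ENODATA;	/* short read: object not fully staged */
	return 0;
}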

/*
 * Write a memory object directly to the xfile's page cache.  Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write.  Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over it.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush.  If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}
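
/*
 * Example (hypothetical sketch): the matching all-or-nothing store wrapper
 * for xfile_pwrite.  A failure partway through can leave some of the object's
 * bytes in the xfile, so callers should treat the staged copy as invalid if
 * this returns an error.  The -ENOMEM for a short write is an arbitrary
 * policy choice for this sketch.
 */
static int __maybe_unused
example_store_obj(
	struct xfile	*xf,
	const void	*buf,
	size_t		count,
	loff_t		pos)
{
	ssize_t		ret = xfile_pwrite(xf, buf, count, pos);

	if (ret < 0)
		return ret;
	if (ret != count)
		return -ENOMEM;	/* short write: staged copy is invalid */
	return 0;
}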

/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}
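
/*
 * Example (hypothetical sketch): walking only the written regions of a
 * sparse xfile.  SEEK_DATA returns the next data offset at or after pos, or
 * -ENXIO once there is no more data before EOF; this sketch processes one
 * page per hit and then skips ahead to the next page boundary.
 */
static int __maybe_unused
example_walk_written(
	struct xfile	*xf)
{
	loff_t		pos = 0;

	for (;;) {
		pos = xfile_seek_data(xf, pos);
		if (pos == -ENXIO)
			return 0;	/* no more data before EOF */
		if (pos < 0)
			return pos;

		/* ...process the page containing pos here... */

		/* Look for more written areas starting at the next page. */
		pos = round_up(pos + 1, PAGE_SIZE);
	}
}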

/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}
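
/*
 * Example (hypothetical sketch): reporting an xfile's memory footprint.  For
 * a sparse xfile, ->bytes (space actually backed by memory or swap) can be
 * much smaller than ->size (the nominal file size), so footprint accounting
 * should use ->bytes.
 */
static unsigned long long __maybe_unused
example_xfile_footprint(
	struct xfile		*xf)
{
	struct xfile_stat	xs;

	if (xfile_stat(xf, &xs))
		return 0;
	return xs.bytes;
}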

/*
 * Grab the (locked) page for a memory object.  The object cannot span a page
 * boundary.  Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path.  shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and
	 * will trip over it.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark each page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}

/*
 * Release the (locked) page for a memory object.  Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}
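
/*
 * Example (hypothetical sketch): modifying an object in place through the
 * page cache with xfile_get_page/xfile_put_page, avoiding a pread/pwrite
 * copy.  The object must not straddle a page boundary, and every successful
 * get must be paired with a put.
 */
static int __maybe_unused
example_bump_counter(
	struct xfile		*xf,
	loff_t			pos)
{
	struct xfile_page	xfpage;
	uint64_t		*p;
	int			error;

	error = xfile_get_page(xf, pos, sizeof(uint64_t), &xfpage);
	if (error)
		return error;

	/* The page comes back locked and uptodate, so edit it directly. */
	p = kmap_local_page(xfpage.page) + offset_in_page(pos);
	(*p)++;
	kunmap_local(p);

	/* Unlock the page and push the update to the backing store. */
	return xfile_put_page(xf, &xfpage);
}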