// SPDX-License-Identifier: GPL-2.0
/*
 * Simple file system for zoned block devices exposing zones as files.
 *
 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/magic.h>
#include <linux/iomap.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/statfs.h>
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/crc32.h>
#include <linux/task_io_accounting_ops.h>

#include "zonefs.h"

static inline int zonefs_zone_mgmt(struct inode *inode,
				   enum req_opf op)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret;

	lockdep_assert_held(&zi->i_truncate_mutex);

	/*
	 * With ZNS drives, closing an explicitly open zone that has not been
	 * written will change the zone state to "closed", that is, the zone
	 * will remain active. Since this can then cause failure of explicit
	 * open operation on other zones if the drive active zone resources
	 * are exceeded, make sure that the zone does not remain active by
	 * resetting it.
	 */
	if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
		op = REQ_OP_ZONE_RESET;

	ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
			       zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
	if (ret) {
		zonefs_err(inode->i_sb,
			   "Zone management operation %s at %llu failed %d\n",
			   blk_op_str(op), zi->i_zsector, ret);
		return ret;
	}

	return 0;
}

static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	i_size_write(inode, isize);
	/*
	 * A full zone is no longer open/active and does not need
	 * explicit closing.
	 */
	if (isize >= zi->i_max_size)
		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
}

static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
				   loff_t length, unsigned int flags,
				   struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/*
	 * All blocks are always mapped below EOF. If reading past EOF,
	 * act as if there is a hole up to the file maximum size.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->length = length;
	} else {
		iomap->type = IOMAP_MAPPED;
		iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	return 0;
}

static const struct iomap_ops zonefs_read_iomap_ops = {
	.iomap_begin	= zonefs_read_iomap_begin,
};

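/*
 * Illustration of the read mapping arithmetic above (example values are
 * assumptions, not taken from a real device): with a 4096 B block size and
 * a zone starting at sector 524288 (zi->i_zsector), a read at file offset
 * 8192 below EOF maps to the device byte address
 * (524288 << SECTOR_SHIFT) + 8192 = 268443648, i.e. the zone start plus the
 * in-file offset. Reads at or beyond EOF report a hole and do not access
 * the device.
 */
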
static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
				    loff_t length, unsigned int flags,
				    struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/* All write I/Os should always be within the file maximum size */
	if (WARN_ON_ONCE(offset + length > zi->i_max_size))
		return -EIO;

	/*
	 * Sequential zones can only accept direct writes. This is already
	 * checked when writes are issued, so warn if we see a page writeback
	 * operation.
	 */
	if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
			 !(flags & IOMAP_DIRECT)))
		return -EIO;

	/*
	 * For conventional zones, all blocks are always mapped. For sequential
	 * zones, all blocks are always mapped below the inode size (zone
	 * write pointer) and unwritten beyond.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_UNWRITTEN;
		iomap->length = zi->i_max_size - iomap->offset;
	} else {
		iomap->type = IOMAP_MAPPED;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	return 0;
}

static const struct iomap_ops zonefs_write_iomap_ops = {
	.iomap_begin	= zonefs_write_iomap_begin,
};

static int zonefs_readpage(struct file *unused, struct page *page)
{
	return iomap_readpage(page, &zonefs_read_iomap_ops);
}

static void zonefs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &zonefs_read_iomap_ops);
}

/*
 * Map blocks for page writeback. This is used only on conventional zone files,
 * which implies that the page range can only be within the fixed inode size.
 */
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
				   struct inode *inode, loff_t offset)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
		return -EIO;
	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
		return -EIO;

	/* If the mapping is already OK, nothing needs to be done */
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
					IOMAP_WRITE, &wpc->iomap, NULL);
}

static const struct iomap_writeback_ops zonefs_writeback_ops = {
	.map_blocks		= zonefs_write_map_blocks,
};

static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepage(page, wbc, &wpc, &zonefs_writeback_ops);
}

static int zonefs_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
}

static int zonefs_swap_activate(struct swap_info_struct *sis,
				struct file *swap_file, sector_t *span)
{
	struct inode *inode = file_inode(swap_file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
		zonefs_err(inode->i_sb,
			   "swap file: not a conventional zone file\n");
		return -EINVAL;
	}

	return iomap_swapfile_activate(sis, swap_file, span,
				       &zonefs_read_iomap_ops);
}

static const struct address_space_operations zonefs_file_aops = {
	.readpage		= zonefs_readpage,
	.readahead		= zonefs_readahead,
	.writepage		= zonefs_writepage,
	.writepages		= zonefs_writepages,
	.set_page_dirty		= iomap_set_page_dirty,
	.releasepage		= iomap_releasepage,
	.invalidatepage		= iomap_invalidatepage,
	.migratepage		= iomap_migrate_page,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.direct_IO		= noop_direct_IO,
	.swap_activate		= zonefs_swap_activate,
};

static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
{
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	loff_t old_isize = i_size_read(inode);
	loff_t nr_blocks;

	if (new_isize == old_isize)
		return;

	spin_lock(&sbi->s_lock);

	/*
	 * This may be called for an update after an IO error.
	 * So beware of the values seen.
	 */
	if (new_isize < old_isize) {
		nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits;
		if (sbi->s_used_blocks > nr_blocks)
			sbi->s_used_blocks -= nr_blocks;
		else
			sbi->s_used_blocks = 0;
	} else {
		sbi->s_used_blocks +=
			(new_isize - old_isize) >> sb->s_blocksize_bits;
		if (sbi->s_used_blocks > sbi->s_blocks)
			sbi->s_used_blocks = sbi->s_blocks;
	}

	spin_unlock(&sbi->s_lock);
}

/*
 * Check a zone condition and adjust its file inode access permissions for
 * offline and readonly zones. Return the inode size corresponding to the
 * amount of readable data in the zone.
 */
static loff_t zonefs_check_zone_condition(struct inode *inode,
					  struct blk_zone *zone, bool warn,
					  bool mount)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	switch (zone->cond) {
	case BLK_ZONE_COND_OFFLINE:
		/*
		 * Dead zone: make the inode immutable, disable all accesses
		 * and set the file size to 0 (zone wp set to zone start).
		 */
		if (warn)
			zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
				    inode->i_ino);
		inode->i_flags |= S_IMMUTABLE;
		inode->i_mode &= ~0777;
		zone->wp = zone->start;
		return 0;
	case BLK_ZONE_COND_READONLY:
		/*
		 * The write pointer of read-only zones is invalid. If such a
		 * zone is found during mount, the file size cannot be retrieved
		 * so we treat the zone as offline (mount == true case).
		 * Otherwise, keep the file size as it was when last updated
		 * so that the user can recover data. In both cases, writes are
		 * always disabled for the zone.
		 */
		if (warn)
			zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
				    inode->i_ino);
		inode->i_flags |= S_IMMUTABLE;
		if (mount) {
			zone->cond = BLK_ZONE_COND_OFFLINE;
			inode->i_mode &= ~0777;
			zone->wp = zone->start;
			return 0;
		}
		inode->i_mode &= ~0222;
		return i_size_read(inode);
	case BLK_ZONE_COND_FULL:
		/* The write pointer of full zones is invalid. */
		return zi->i_max_size;
	default:
		if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
			return zi->i_max_size;
		return (zone->wp - zone->start) << SECTOR_SHIFT;
	}
}

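/*
 * Summary of zonefs_check_zone_condition() results, derived from the switch
 * above:
 *
 *	Zone condition		Inode access		Returned size
 *	OFFLINE			immutable, mode 0000	0
 *	READONLY at mount	treated as OFFLINE	0
 *	READONLY at runtime	immutable, read-only	current i_size
 *	FULL			unchanged		zi->i_max_size
 *	other (conventional)	unchanged		zi->i_max_size
 *	other (sequential)	unchanged		(wp - start) << SECTOR_SHIFT
 */
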
struct zonefs_ioerr_data {
	struct inode	*inode;
	bool		write;
};

static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
			      void *data)
{
	struct zonefs_ioerr_data *err = data;
	struct inode *inode = err->inode;
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	loff_t isize, data_size;

	/*
	 * Check the zone condition: if the zone is not "bad" (offline or
	 * read-only), read errors are simply signaled to the IO issuer as long
	 * as there is no inconsistency between the inode size and the amount of
	 * data written in the zone (data_size).
	 */
	data_size = zonefs_check_zone_condition(inode, zone, true, false);
	isize = i_size_read(inode);
	if (zone->cond != BLK_ZONE_COND_OFFLINE &&
	    zone->cond != BLK_ZONE_COND_READONLY &&
	    !err->write && isize == data_size)
		return 0;

	/*
	 * At this point, we detected either a bad zone or an inconsistency
	 * between the inode size and the amount of data written in the zone.
	 * For the latter case, the cause may be a write IO error or an external
	 * action on the device. Two error patterns exist:
	 * 1) The inode size is lower than the amount of data in the zone:
	 *    a write operation partially failed and data was written at the end
	 *    of the file. This can happen in the case of a large direct IO
	 *    needing several BIOs and/or write requests to be processed.
	 * 2) The inode size is larger than the amount of data in the zone:
	 *    this can happen with a deferred write error with the use of the
	 *    device side write cache after getting successful write IO
	 *    completions. Other possibilities are (a) an external corruption,
	 *    e.g. an application reset the zone directly, or (b) the device
	 *    has a serious problem (e.g. firmware bug).
	 *
	 * In all cases, warn about inode size inconsistency and handle the
	 * IO error according to the zone condition and to the mount options.
	 */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size)
		zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
			    inode->i_ino, isize, data_size);

	/*
	 * First handle bad zones signaled by hardware. The mount options
	 * errors=zone-ro and errors=zone-offline result in changing the
	 * zone condition to read-only and offline respectively, as if the
	 * condition was signaled by the hardware.
	 */
	if (zone->cond == BLK_ZONE_COND_OFFLINE ||
	    sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) {
		zonefs_warn(sb, "inode %lu: read/write access disabled\n",
			    inode->i_ino);
		if (zone->cond != BLK_ZONE_COND_OFFLINE) {
			zone->cond = BLK_ZONE_COND_OFFLINE;
			data_size = zonefs_check_zone_condition(inode, zone,
								false, false);
		}
	} else if (zone->cond == BLK_ZONE_COND_READONLY ||
		   sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
		zonefs_warn(sb, "inode %lu: write access disabled\n",
			    inode->i_ino);
		if (zone->cond != BLK_ZONE_COND_READONLY) {
			zone->cond = BLK_ZONE_COND_READONLY;
			data_size = zonefs_check_zone_condition(inode, zone,
								false, false);
		}
	} else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO &&
		   data_size > isize) {
		/* Do not expose garbage data */
		data_size = isize;
	}

	/*
	 * If the filesystem is mounted with the explicit-open mount option, we
	 * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
	 * the read-only or offline condition, to avoid attempting an explicit
	 * close of the zone when the inode file is closed.
	 */
	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
	    (zone->cond == BLK_ZONE_COND_OFFLINE ||
	     zone->cond == BLK_ZONE_COND_READONLY))
		zi->i_flags &= ~ZONEFS_ZONE_OPEN;

	/*
	 * If errors=remount-ro was specified, any error results in remounting
	 * the volume as read-only.
	 */
	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) {
		zonefs_warn(sb, "remounting filesystem read-only\n");
		sb->s_flags |= SB_RDONLY;
	}

	/*
	 * Update block usage stats and the inode size to prevent access to
	 * invalid data.
	 */
	zonefs_update_stats(inode, data_size);
	zonefs_i_size_write(inode, data_size);
	zi->i_wpoffset = data_size;

	return 0;
}

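/*
 * How the errors= mount options map to the handling above (summary; see
 * Documentation/filesystems/zonefs.rst for the full description):
 *
 *	errors=remount-ro (default)	fix the file size, remount read-only
 *	errors=zone-ro			fix the file size, make the zone read-only
 *	errors=zone-offline		set the file size to 0, set the zone offline
 *	errors=repair			fix the file size only
 */
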
/*
 * When a file IO error occurs, check the file zone to see if there is a
 * change in the zone condition (e.g. offline or read-only). For a failed
 * write to a sequential zone, the zone write pointer position must also be
 * checked to correct the file size and zonefs inode write pointer offset
 * if needed (these can be out of sync with the drive due to partial write
 * failures).
 */
static void __zonefs_io_error(struct inode *inode, bool write)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	unsigned int noio_flag;
	unsigned int nr_zones = 1;
	struct zonefs_ioerr_data err = {
		.inode = inode,
		.write = write,
	};
	int ret;

	/*
	 * The only files that have more than one zone are conventional zone
	 * files with aggregated conventional zones, for which the inode zone
	 * size is always larger than the device zone size.
	 */
	if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev))
		nr_zones = zi->i_zone_size >>
			(sbi->s_zone_sectors_shift + SECTOR_SHIFT);

	/*
	 * Memory allocations in blkdev_report_zones() can trigger a memory
	 * reclaim which may in turn cause a recursion into zonefs as well as
	 * struct request allocations for the same device. The former case may
	 * end up in a deadlock on the inode truncate mutex, while the latter
	 * may prevent IO forward progress. Executing the report zones under
	 * the GFP_NOIO context avoids both problems.
	 */
	noio_flag = memalloc_noio_save();
	ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones,
				  zonefs_io_error_cb, &err);
	if (ret != nr_zones)
		zonefs_err(sb, "Get inode %lu zone information failed %d\n",
			   inode->i_ino, ret);
	memalloc_noio_restore(noio_flag);
}

static void zonefs_io_error(struct inode *inode, bool write)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	mutex_lock(&zi->i_truncate_mutex);
	__zonefs_io_error(inode, write);
	mutex_unlock(&zi->i_truncate_mutex);
}

static int zonefs_file_truncate(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	loff_t old_isize;
	enum req_opf op;
	int ret = 0;

	/*
	 * Only sequential zone files can be truncated and truncation is allowed
	 * only down to a 0 size, which is equivalent to a zone reset, and to
	 * the maximum file size, which is equivalent to a zone finish.
	 */
	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
		return -EPERM;

	if (!isize)
		op = REQ_OP_ZONE_RESET;
	else if (isize == zi->i_max_size)
		op = REQ_OP_ZONE_FINISH;
	else
		return -EPERM;

	inode_dio_wait(inode);

	/* Serialize against page faults */
	down_write(&zi->i_mmap_sem);

	/* Serialize against zonefs_iomap_begin() */
	mutex_lock(&zi->i_truncate_mutex);

	old_isize = i_size_read(inode);
	if (isize == old_isize)
		goto unlock;

	ret = zonefs_zone_mgmt(inode, op);
	if (ret)
		goto unlock;

	/*
	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
	 * take care of open zones.
	 */
	if (zi->i_flags & ZONEFS_ZONE_OPEN) {
		/*
		 * Truncating a zone to EMPTY or FULL is the equivalent of
		 * closing the zone. For a truncation to 0, we need to
		 * re-open the zone to ensure new writes can be processed.
		 * For a truncation to the maximum file size, the zone is
		 * closed and writes cannot be accepted anymore, so clear
		 * the open flag.
		 */
		if (!isize)
			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
		else
			zi->i_flags &= ~ZONEFS_ZONE_OPEN;
	}

	zonefs_update_stats(inode, isize);
	truncate_setsize(inode, isize);
	zi->i_wpoffset = isize;

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
	up_write(&zi->i_mmap_sem);

	return ret;
}

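/*
 * Illustrative user-space usage of the truncation rules above (a sketch,
 * assuming a sequential zone file mounted at /mnt/seq/0; max_size stands
 * for the zone file maximum size, i.e. the zone capacity):
 *
 *	int fd = open("/mnt/seq/0", O_RDWR);
 *
 *	ftruncate(fd, 0);		// zone reset, all zone data is lost
 *	ftruncate(fd, max_size);	// zone finish, file size forced to max
 *	ftruncate(fd, 4096);		// fails, errno == EPERM
 */
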
static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = d_inode(dentry);
	int ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	ret = setattr_prepare(dentry, iattr);
	if (ret)
		return ret;

	/*
	 * Since files and directories cannot be created nor deleted, do not
	 * allow setting any write attributes on the sub-directories grouping
	 * files by zone type.
	 */
	if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
	    (iattr->ia_mode & 0222))
		return -EPERM;

	if (((iattr->ia_valid & ATTR_UID) &&
	     !uid_eq(iattr->ia_uid, inode->i_uid)) ||
	    ((iattr->ia_valid & ATTR_GID) &&
	     !gid_eq(iattr->ia_gid, inode->i_gid))) {
		ret = dquot_transfer(inode, iattr);
		if (ret)
			return ret;
	}

	if (iattr->ia_valid & ATTR_SIZE) {
		ret = zonefs_file_truncate(inode, iattr->ia_size);
		if (ret)
			return ret;
	}

	setattr_copy(inode, iattr);

	return 0;
}

static const struct inode_operations zonefs_file_inode_operations = {
	.setattr	= zonefs_inode_setattr,
};

static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
			     int datasync)
{
	struct inode *inode = file_inode(file);
	int ret = 0;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	/*
	 * Since only direct writes are allowed in sequential files, page cache
	 * flush is needed only for conventional zone files.
	 */
	if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
		ret = file_write_and_wait_range(file, start, end);
	if (!ret)
		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);

	if (ret)
		zonefs_io_error(inode, true);

	return ret;
}

static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf)
{
	struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file));
	vm_fault_t ret;

	down_read(&zi->i_mmap_sem);
	ret = filemap_fault(vmf);
	up_read(&zi->i_mmap_sem);

	return ret;
}

static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	vm_fault_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return VM_FAULT_SIGBUS;

	/*
	 * Sanity check: only conventional zone files can have shared
	 * writeable mappings.
	 */
	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
		return VM_FAULT_NOPAGE;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/* Serialize against truncates */
	down_read(&zi->i_mmap_sem);
	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
	up_read(&zi->i_mmap_sem);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static const struct vm_operations_struct zonefs_file_vm_ops = {
	.fault		= zonefs_filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= zonefs_filemap_page_mkwrite,
};

static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * Conventional zones accept random writes, so their files can support
	 * shared writable mappings. For sequential zone files, only read
	 * mappings are possible since there are no guarantees for write
	 * ordering between msync() and page cache writeback.
	 */
	if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
	    (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;

	file_accessed(file);
	vma->vm_ops = &zonefs_file_vm_ops;

	return 0;
}

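/*
 * Resulting mmap() behavior (illustrative; the exact outcome depends on how
 * the file was opened): on a sequential zone file, a private read mapping
 * such as mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0) is accepted, while
 * a shared writable mapping, e.g. with PROT_WRITE and MAP_SHARED, fails
 * with EINVAL. Conventional zone files accept both.
 */
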
static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t isize = i_size_read(file_inode(file));

	/*
	 * Seeks are limited to below the zone size for conventional zones
	 * and below the zone write pointer for sequential zones. In both
	 * cases, this limit is the inode size.
	 */
	return generic_file_llseek_size(file, offset, whence, isize, isize);
}

static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
					int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (error) {
		zonefs_io_error(inode, true);
		return error;
	}

	if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
		/*
		 * Note that we may be seeing completions out of order,
		 * but that is not a problem since a write completed
		 * successfully necessarily means that all preceding writes
		 * were also successful. So we can safely increase the inode
		 * size to the write end location.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		if (i_size_read(inode) < iocb->ki_pos + size) {
			zonefs_update_stats(inode, iocb->ki_pos + size);
			zonefs_i_size_write(inode, iocb->ki_pos + size);
		}
		mutex_unlock(&zi->i_truncate_mutex);
	}

	return 0;
}

static const struct iomap_dio_ops zonefs_write_dio_ops = {
	.end_io			= zonefs_file_write_dio_end_io,
};

static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct block_device *bdev = inode->i_sb->s_bdev;
	unsigned int max;
	struct bio *bio;
	ssize_t size;
	int nr_pages;
	ssize_t ret;

	max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
	iov_iter_truncate(from, max);

	nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
	if (!nr_pages)
		return 0;

	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
	if (!bio)
		return -ENOMEM;

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = zi->i_zsector;
	bio->bi_write_hint = iocb->ki_hint;
	bio->bi_ioprio = iocb->ki_ioprio;
	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
	if (iocb->ki_flags & IOCB_DSYNC)
		bio->bi_opf |= REQ_FUA;

	ret = bio_iov_iter_get_pages(bio, from);
	if (unlikely(ret))
		goto out_release;

	size = bio->bi_iter.bi_size;
	task_io_account_write(size);

	if (iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(bio, iocb);

	ret = submit_bio_wait(bio);

	/*
	 * If the file zone was written underneath the file system, the zone
	 * write pointer may not be where we expect it to be, but the zone
	 * append write can still succeed. So check manually that we wrote where
	 * we intended to, that is, at zi->i_wpoffset.
	 */
	if (!ret) {
		sector_t wpsector =
			zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);

		if (bio->bi_iter.bi_sector != wpsector) {
			zonefs_warn(inode->i_sb,
				"Corrupted write pointer %llu for zone at %llu\n",
				bio->bi_iter.bi_sector, zi->i_zsector);
			ret = -EIO;
		}
	}

	zonefs_file_write_dio_end_io(iocb, size, ret, 0);

out_release:
	bio_release_pages(bio, false);
	bio_put(bio);

	if (ret >= 0) {
		iocb->ki_pos += size;
		return size;
	}

	return ret;
}

/*
 * Do not exceed the LFS limits nor the file zone size. If pos is under the
 * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 */
static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
					loff_t count)
{
	struct inode *inode = file_inode(file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	loff_t limit = rlimit(RLIMIT_FSIZE);
	loff_t max_size = zi->i_max_size;

	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		count = min(count, limit - pos);
	}

	if (!(file->f_flags & O_LARGEFILE))
		max_size = min_t(loff_t, MAX_NON_LFS, max_size);

	if (unlikely(pos >= max_size))
		return -EFBIG;

	return min(count, max_size - pos);
}

static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	loff_t count;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EINVAL;

	if (iocb->ki_flags & IOCB_APPEND) {
		if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
			return -EINVAL;
		mutex_lock(&zi->i_truncate_mutex);
		iocb->ki_pos = zi->i_wpoffset;
		mutex_unlock(&zi->i_truncate_mutex);
	}

	count = zonefs_write_check_limits(file, iocb->ki_pos,
					  iov_iter_count(from));
	if (count < 0)
		return count;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}

/*
 * Handle direct writes. For sequential zone files, this is the only possible
 * write path. For these files, check that the user is issuing writes
 * sequentially from the end of the file. This code assumes that the block layer
 * delivers write requests to the device in sequential order. This is always the
 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 * elevator feature is being used (e.g. mq-deadline). The block layer always
 * automatically selects such an elevator for zoned block devices during the
 * device initialization.
 */
static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	bool sync = is_sync_kiocb(iocb);
	bool append = false;
	ssize_t ret, count;

	/*
	 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
	 * as this can cause write reordering (e.g. the first aio gets EAGAIN
	 * on the inode lock but the second goes through but is now unaligned).
	 */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	count = zonefs_write_checks(iocb, from);
	if (count <= 0) {
		ret = count;
		goto inode_unlock;
	}

	if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
		ret = -EINVAL;
		goto inode_unlock;
	}

	/* Enforce sequential writes (append only) in sequential zones */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
		mutex_lock(&zi->i_truncate_mutex);
		if (iocb->ki_pos != zi->i_wpoffset) {
			mutex_unlock(&zi->i_truncate_mutex);
			ret = -EINVAL;
			goto inode_unlock;
		}
		mutex_unlock(&zi->i_truncate_mutex);
		append = sync;
	}

	if (append)
		ret = zonefs_file_dio_append(iocb, from);
	else
		ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
				   &zonefs_write_dio_ops, sync);
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
	    (ret > 0 || ret == -EIOCBQUEUED)) {
		if (ret > 0)
			count = ret;
		mutex_lock(&zi->i_truncate_mutex);
		zi->i_wpoffset += count;
		mutex_unlock(&zi->i_truncate_mutex);
	}

inode_unlock:
	inode_unlock(inode);

	return ret;
}

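/*
 * Illustrative user-space write loop for a sequential zone file (a sketch;
 * the 4096 B alignment is an assumption, the actual requirement is the file
 * system block size): only block-aligned O_DIRECT writes issued at the
 * write pointer are accepted, so O_APPEND is the simplest way to stay
 * sequential.
 *
 *	int fd = open("/mnt/seq/0", O_WRONLY | O_DIRECT | O_APPEND);
 *	void *buf;
 *
 *	posix_memalign(&buf, 4096, 4096);
 *	memset(buf, 0, 4096);
 *	write(fd, buf, 4096);	// lands at the zone write pointer
 */
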
static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	ssize_t ret;

	/*
	 * Direct IO writes are mandatory for sequential zone files so that the
	 * write IO issuing order is preserved.
	 */
	if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
		return -EIO;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = zonefs_write_checks(iocb, from);
	if (ret <= 0)
		goto inode_unlock;

	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
	if (ret > 0)
		iocb->ki_pos += ret;
	else if (ret == -EIO)
		zonefs_io_error(inode, true);

inode_unlock:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);

	return ret;
}

static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	if (sb_rdonly(inode->i_sb))
		return -EROFS;

	/* Write operations beyond the zone size are not allowed */
	if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
		return -EFBIG;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ssize_t ret = zonefs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	return zonefs_file_buffered_write(iocb, from);
}

static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
				       int error, unsigned int flags)
{
	if (error) {
		zonefs_io_error(file_inode(iocb->ki_filp), false);
		return error;
	}

	return 0;
}

static const struct iomap_dio_ops zonefs_read_dio_ops = {
	.end_io			= zonefs_file_read_dio_end_io,
};

static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;
	ssize_t ret;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	if (iocb->ki_pos >= zi->i_max_size)
		return 0;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (iocb->ki_pos >= isize) {
		mutex_unlock(&zi->i_truncate_mutex);
		ret = 0;
		goto inode_unlock;
	}
	iov_iter_truncate(to, isize - iocb->ki_pos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (iocb->ki_flags & IOCB_DIRECT) {
		size_t count = iov_iter_count(to);

		if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
			ret = -EINVAL;
			goto inode_unlock;
		}
		file_accessed(iocb->ki_filp);
		ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
				   &zonefs_read_dio_ops, is_sync_kiocb(iocb));
	} else {
		ret = generic_file_read_iter(iocb, to);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

inode_unlock:
	inode_unlock_shared(inode);

	return ret;
}

static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);

	if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
		return false;

	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
		return false;

	if (!(file->f_mode & FMODE_WRITE))
		return false;

	return true;
}

static int zonefs_open_zone(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	if (!zi->i_wr_refcnt) {
		if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
			atomic_dec(&sbi->s_open_zones);
			ret = -EBUSY;
			goto unlock;
		}

		if (i_size_read(inode) < zi->i_max_size) {
			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
			if (ret) {
				atomic_dec(&sbi->s_open_zones);
				goto unlock;
			}
			zi->i_flags |= ZONEFS_ZONE_OPEN;
		}
	}

	zi->i_wr_refcnt++;

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}

static int zonefs_file_open(struct inode *inode, struct file *file)
{
	int ret;

	ret = generic_file_open(inode, file);
	if (ret)
		return ret;

	if (zonefs_file_use_exp_open(inode, file))
		return zonefs_open_zone(inode);

	return 0;
}

static void zonefs_close_zone(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);
	zi->i_wr_refcnt--;
	if (!zi->i_wr_refcnt) {
		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
		struct super_block *sb = inode->i_sb;

		/*
		 * If the file zone is full, it is not open anymore and we only
		 * need to decrement the open count.
		 */
		if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
			goto dec;

		ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret) {
			__zonefs_io_error(inode, false);
			/*
			 * Leaving zones explicitly open may lead to a state
			 * where most zones cannot be written (zone resources
			 * exhausted). So take preventive action by remounting
			 * read-only.
			 */
			if (zi->i_flags & ZONEFS_ZONE_OPEN &&
			    !(sb->s_flags & SB_RDONLY)) {
				zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
				sb->s_flags |= SB_RDONLY;
			}
		}
		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
dec:
		atomic_dec(&sbi->s_open_zones);
	}
	mutex_unlock(&zi->i_truncate_mutex);
}

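/*
 * With the explicit-open mount option, the zone state thus follows the file
 * open/close lifecycle: the first writable open of a sequential file
 * explicitly opens the zone (or fails with EBUSY once s_max_open_zones
 * zones are open), and the last release closes it. As an illustration,
 * assuming a drive limit of N open zones, opening seq/0 to seq/N-1 for
 * writing succeeds while a writable open of seq/N fails with EBUSY until
 * one of the other files is closed.
 */
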
static int zonefs_file_release(struct inode *inode, struct file *file)
{
	/*
	 * If we explicitly open a zone we must close it again as well, but the
	 * zone management operation can fail (either due to an IO error or as
	 * the zone has gone offline or read-only). Make sure we don't fail the
	 * close(2) for user-space.
	 */
	if (zonefs_file_use_exp_open(inode, file))
		zonefs_close_zone(inode);

	return 0;
}

static const struct file_operations zonefs_file_operations = {
	.open		= zonefs_file_open,
	.release	= zonefs_file_release,
	.fsync		= zonefs_file_fsync,
	.mmap		= zonefs_file_mmap,
	.llseek		= zonefs_file_llseek,
	.read_iter	= zonefs_file_read_iter,
	.write_iter	= zonefs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iomap_dio_iopoll,
};

static struct kmem_cache *zonefs_inode_cachep;

static struct inode *zonefs_alloc_inode(struct super_block *sb)
{
	struct zonefs_inode_info *zi;

	zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL);
	if (!zi)
		return NULL;

	inode_init_once(&zi->i_vnode);
	mutex_init(&zi->i_truncate_mutex);
	init_rwsem(&zi->i_mmap_sem);
	zi->i_wr_refcnt = 0;
	zi->i_flags = 0;

	return &zi->i_vnode;
}

static void zonefs_free_inode(struct inode *inode)
{
	kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode));
}

/*
 * File system stat.
 */
static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	enum zonefs_ztype t;
	u64 fsid;

	buf->f_type = ZONEFS_MAGIC;
	buf->f_bsize = sb->s_blocksize;
	buf->f_namelen = ZONEFS_NAME_MAX;

	spin_lock(&sbi->s_lock);

	buf->f_blocks = sbi->s_blocks;
	if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks))
		buf->f_bfree = 0;
	else
		buf->f_bfree = buf->f_blocks - sbi->s_used_blocks;
	buf->f_bavail = buf->f_bfree;

	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
		if (sbi->s_nr_files[t])
			buf->f_files += sbi->s_nr_files[t] + 1;
	}
	buf->f_ffree = 0;

	spin_unlock(&sbi->s_lock);

	fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^
		le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64));
	buf->f_fsid = u64_to_fsid(fsid);

	return 0;
}

enum {
	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
	Opt_explicit_open, Opt_err,
};

static const match_table_t tokens = {
	{ Opt_errors_ro,	"errors=remount-ro"},
	{ Opt_errors_zro,	"errors=zone-ro"},
	{ Opt_errors_zol,	"errors=zone-offline"},
	{ Opt_errors_repair,	"errors=repair"},
	{ Opt_explicit_open,	"explicit-open" },
	{ Opt_err,		NULL}
};

static int zonefs_parse_options(struct super_block *sb, char *options)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	substring_t args[MAX_OPT_ARGS];
	char *p;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;

		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_errors_ro:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
			break;
		case Opt_errors_zro:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
			break;
		case Opt_errors_zol:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
			break;
		case Opt_errors_repair:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
			break;
		case Opt_explicit_open:
			sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
			break;
		default:
			return -EINVAL;
		}
	}

	return 0;
}

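/*
 * Example command lines exercising the options parsed above (illustrative;
 * /dev/nullb0 stands for any zoned block device):
 *
 *	mount -t zonefs /dev/nullb0 /mnt
 *	mount -t zonefs -o errors=zone-ro,explicit-open /dev/nullb0 /mnt
 *	mount -o remount,errors=repair /mnt
 */
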
static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb);

	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO)
		seq_puts(seq, ",errors=remount-ro");
	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)
		seq_puts(seq, ",errors=zone-ro");
	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)
		seq_puts(seq, ",errors=zone-offline");
	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR)
		seq_puts(seq, ",errors=repair");

	return 0;
}

static int zonefs_remount(struct super_block *sb, int *flags, char *data)
{
	sync_filesystem(sb);

	return zonefs_parse_options(sb, data);
}

static const struct super_operations zonefs_sops = {
	.alloc_inode	= zonefs_alloc_inode,
	.free_inode	= zonefs_free_inode,
	.statfs		= zonefs_statfs,
	.remount_fs	= zonefs_remount,
	.show_options	= zonefs_show_options,
};

static const struct inode_operations zonefs_dir_inode_operations = {
	.lookup		= simple_lookup,
	.setattr	= zonefs_inode_setattr,
};

static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
				  enum zonefs_ztype type)
{
	struct super_block *sb = parent->i_sb;

	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
	inode_init_owner(inode, parent, S_IFDIR | 0555);
	inode->i_op = &zonefs_dir_inode_operations;
	inode->i_fop = &simple_dir_operations;
	set_nlink(inode, 2);
	inc_nlink(parent);
}

static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
				  enum zonefs_ztype type)
{
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret = 0;

	inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
	inode->i_mode = S_IFREG | sbi->s_perm;

	zi->i_ztype = type;
	zi->i_zsector = zone->start;
	zi->i_zone_size = zone->len << SECTOR_SHIFT;
	if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
	    !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
		zonefs_err(sb,
			   "zone size %llu doesn't match device's zone sectors %llu\n",
			   zi->i_zone_size,
			   bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
		return -EINVAL;
	}

	zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
			       zone->capacity << SECTOR_SHIFT);
	zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);

	inode->i_uid = sbi->s_uid;
	inode->i_gid = sbi->s_gid;
	inode->i_size = zi->i_wpoffset;
	inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT;

	inode->i_op = &zonefs_file_inode_operations;
	inode->i_fop = &zonefs_file_operations;
	inode->i_mapping->a_ops = &zonefs_file_aops;

	sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
	sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
	sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;

	/*
	 * For sequential zones, make sure that any open zone is closed first
	 * to ensure that the initial number of open zones is 0, in sync with
	 * the open zone accounting done when the mount option
	 * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
	 */
	if (type == ZONEFS_ZTYPE_SEQ &&
	    (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	     zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
		mutex_lock(&zi->i_truncate_mutex);
		ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		mutex_unlock(&zi->i_truncate_mutex);
	}

	return ret;
}

static struct dentry *zonefs_create_inode(struct dentry *parent,
					const char *name, struct blk_zone *zone,
					enum zonefs_ztype type)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	struct inode *inode;
	int ret = -ENOMEM;

	dentry = d_alloc_name(parent, name);
	if (!dentry)
		return ERR_PTR(ret);

	inode = new_inode(parent->d_sb);
	if (!inode)
		goto dput;

	inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
	if (zone) {
		ret = zonefs_init_file_inode(inode, zone, type);
		if (ret) {
			iput(inode);
			goto dput;
		}
	} else {
		zonefs_init_dir_inode(dir, inode, type);
	}

	d_add(dentry, inode);
	dir->i_size++;

	return dentry;

dput:
	dput(dentry);

	return ERR_PTR(ret);
}

struct zonefs_zone_data {
	struct super_block	*sb;
	unsigned int		nr_zones[ZONEFS_ZTYPE_MAX];
	struct blk_zone		*zones;
};

/*
 * Create a zone group and populate it with zone files.
 */
static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
				enum zonefs_ztype type)
{
	struct super_block *sb = zd->sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct blk_zone *zone, *next, *end;
	const char *zgroup_name;
	char *file_name;
	struct dentry *dir, *dent;
	unsigned int n = 0;
	int ret;

	/* If the group is empty, there is nothing to do */
	if (!zd->nr_zones[type])
		return 0;

	file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
	if (!file_name)
		return -ENOMEM;

	if (type == ZONEFS_ZTYPE_CNV)
		zgroup_name = "cnv";
	else
		zgroup_name = "seq";

	dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
	if (IS_ERR(dir)) {
		ret = PTR_ERR(dir);
		goto free;
	}

	/*
	 * The first zone contains the super block: skip it.
	 */
	end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk);
	for (zone = &zd->zones[1]; zone < end; zone = next) {

		next = zone + 1;
		if (zonefs_zone_type(zone) != type)
			continue;

		/*
		 * For conventional zones, contiguous zones can be aggregated
		 * together to form larger files. Note that this overwrites the
		 * length of the first zone of the set of contiguous zones
		 * aggregated together. If one offline or read-only zone is
		 * found, assume that all zones aggregated have the same
		 * condition.
		 */
		if (type == ZONEFS_ZTYPE_CNV &&
		    (sbi->s_features & ZONEFS_F_AGGRCNV)) {
			for (; next < end; next++) {
				if (zonefs_zone_type(next) != type)
					break;
				zone->len += next->len;
				zone->capacity += next->capacity;
				if (next->cond == BLK_ZONE_COND_READONLY &&
				    zone->cond != BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_READONLY;
				else if (next->cond == BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_OFFLINE;
			}
			if (zone->capacity != zone->len) {
				zonefs_err(sb, "Invalid conventional zone capacity\n");
				ret = -EINVAL;
				goto free;
			}
		}

		/*
		 * Use the file number within its group as file name.
		 */
		snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
		dent = zonefs_create_inode(dir, file_name, zone, type);
		if (IS_ERR(dent)) {
			ret = PTR_ERR(dent);
			goto free;
		}

		n++;
	}

	zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
		    zgroup_name, n, n > 1 ? "s" : "");

	sbi->s_nr_files[type] = n;
	ret = 0;

free:
	kfree(file_name);

	return ret;
}

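/*
 * Resulting mount layout (illustration for a device with 3 usable
 * conventional zones and 4 usable sequential zones):
 *
 *	/mnt/
 *	  cnv/0 cnv/1 cnv/2
 *	  seq/0 seq/1 seq/2 seq/3
 *
 * Zone files are named by their index within the group. With the
 * ZONEFS_F_AGGRCNV format feature, contiguous conventional zones collapse
 * into a single larger cnv file.
 */
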
static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
				   void *data)
{
	struct zonefs_zone_data *zd = data;

	/*
	 * Count the number of usable zones: the first zone at index 0 contains
	 * the super block and is ignored.
	 */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		zone->wp = zone->start + zone->len;
		if (idx)
			zd->nr_zones[ZONEFS_ZTYPE_CNV]++;
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		if (idx)
			zd->nr_zones[ZONEFS_ZTYPE_SEQ]++;
		break;
	default:
		zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
			   zone->type);
		return -EIO;
	}

	memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone));

	return 0;
}

static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
{
	struct block_device *bdev = zd->sb->s_bdev;
	int ret;

	zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk),
			     sizeof(struct blk_zone), GFP_KERNEL);
	if (!zd->zones)
		return -ENOMEM;

	/* Get zones information from the device */
	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				  zonefs_get_zone_info_cb, zd);
	if (ret < 0) {
		zonefs_err(zd->sb, "Zone report failed %d\n", ret);
		return ret;
	}

	if (ret != blkdev_nr_zones(bdev->bd_disk)) {
		zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
			   ret, blkdev_nr_zones(bdev->bd_disk));
		return -EIO;
	}

	return 0;
}

static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd)
{
	kvfree(zd->zones);
}

/*
 * Read super block information from the device.
 */
static int zonefs_read_super(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct zonefs_super *super;
	u32 crc, stored_crc;
	struct page *page;
	struct bio_vec bio_vec;
	struct bio bio;
	int ret;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	bio_init(&bio, &bio_vec, 1);
	bio.bi_iter.bi_sector = 0;
	bio.bi_opf = REQ_OP_READ;
	bio_set_dev(&bio, sb->s_bdev);
	bio_add_page(&bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(&bio);
	if (ret)
		goto free_page;

	super = kmap(page);

	ret = -EINVAL;
	if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
		goto unmap;

	stored_crc = le32_to_cpu(super->s_crc);
	super->s_crc = 0;
	crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super));
	if (crc != stored_crc) {
		zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
			   crc, stored_crc);
		goto unmap;
	}

	sbi->s_features = le64_to_cpu(super->s_features);
	if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
		zonefs_err(sb, "Unknown features set 0x%llx\n",
			   sbi->s_features);
		goto unmap;
	}

	if (sbi->s_features & ZONEFS_F_UID) {
		sbi->s_uid = make_kuid(current_user_ns(),
				       le32_to_cpu(super->s_uid));
		if (!uid_valid(sbi->s_uid)) {
			zonefs_err(sb, "Invalid UID feature\n");
			goto unmap;
		}
	}

	if (sbi->s_features & ZONEFS_F_GID) {
		sbi->s_gid = make_kgid(current_user_ns(),
				       le32_to_cpu(super->s_gid));
		if (!gid_valid(sbi->s_gid)) {
			zonefs_err(sb, "Invalid GID feature\n");
			goto unmap;
		}
	}

	if (sbi->s_features & ZONEFS_F_PERM)
		sbi->s_perm = le32_to_cpu(super->s_perm);

	if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
		zonefs_err(sb, "Reserved area is being used\n");
		goto unmap;
	}

	import_uuid(&sbi->s_uuid, super->s_uuid);
	ret = 0;

unmap:
	kunmap(page);
free_page:
	__free_page(page);

	return ret;
}

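/*
 * Fields of struct zonefs_super consumed above (all little-endian on disk):
 * s_magic (must equal ZONEFS_MAGIC), s_crc (crc32 of the superblock with
 * this field zeroed), s_features (must only contain
 * ZONEFS_F_DEFINED_FEATURES bits), s_uid/s_gid/s_perm (honored when the
 * matching feature flag is set), s_uuid, and s_reserved (must be all
 * zeroes).
 */
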
/*
 * Check that the device is zoned. If it is, get the list of zones and create
 * sub-directories and files according to the device zone configuration and
 * format options.
 */
static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct zonefs_zone_data zd;
	struct zonefs_sb_info *sbi;
	struct inode *inode;
	enum zonefs_ztype t;
	int ret;

	if (!bdev_is_zoned(sb->s_bdev)) {
		zonefs_err(sb, "Not a zoned block device\n");
		return -EINVAL;
	}

	/*
	 * Initialize super block information: the maximum file size is updated
	 * when the zone files are created so that the format option
	 * ZONEFS_F_AGGRCNV which increases the maximum file size of a file
	 * beyond the zone size is taken into account.
	 */
	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;

	spin_lock_init(&sbi->s_lock);
	sb->s_fs_info = sbi;
	sb->s_magic = ZONEFS_MAGIC;
	sb->s_maxbytes = 0;
	sb->s_op = &zonefs_sops;
	sb->s_time_gran	= 1;

	/*
	 * The block size is set to the device physical sector size to ensure
	 * that write operations on 512e devices (512B logical block and 4KB
	 * physical block) are always aligned to the device physical blocks,
	 * as mandated by the ZBC/ZAC specifications.
	 */
	sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev));
	sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
	sbi->s_uid = GLOBAL_ROOT_UID;
	sbi->s_gid = GLOBAL_ROOT_GID;
	sbi->s_perm = 0640;
	sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
	sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
	atomic_set(&sbi->s_open_zones, 0);

	ret = zonefs_read_super(sb);
	if (ret)
		return ret;

	ret = zonefs_parse_options(sb, data);
	if (ret)
		return ret;

	memset(&zd, 0, sizeof(struct zonefs_zone_data));
	zd.sb = sb;
	ret = zonefs_get_zone_info(&zd);
	if (ret)
		goto cleanup;

	zonefs_info(sb, "Mounting %u zones",
		    blkdev_nr_zones(sb->s_bdev->bd_disk));

	if (!sbi->s_max_open_zones &&
	    sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
		zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
	}

	/* Create root directory inode */
	ret = -ENOMEM;
	inode = new_inode(sb);
	if (!inode)
		goto cleanup;

	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk);
	inode->i_mode = S_IFDIR | 0555;
	inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
	inode->i_op = &zonefs_dir_inode_operations;
	inode->i_fop = &simple_dir_operations;
	set_nlink(inode, 2);

	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		goto cleanup;

	/* Create and populate files in zone groups directories */
	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
		ret = zonefs_create_zgroup(&zd, t);
		if (ret)
			break;
	}

cleanup:
	zonefs_cleanup_zone_info(&zd);

	return ret;
}

static struct dentry *zonefs_mount(struct file_system_type *fs_type,
				   int flags, const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super);
}

static void zonefs_kill_super(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);

	if (sb->s_root)
		d_genocide(sb->s_root);
	kill_block_super(sb);
	kfree(sbi);
}

/*
 * File system definition and registration.
 */
static struct file_system_type zonefs_type = {
	.owner		= THIS_MODULE,
	.name		= "zonefs",
	.mount		= zonefs_mount,
	.kill_sb	= zonefs_kill_super,
	.fs_flags	= FS_REQUIRES_DEV,
};

static int __init zonefs_init_inodecache(void)
{
	zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache",
			sizeof(struct zonefs_inode_info), 0,
			(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
			NULL);
	if (zonefs_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}

static void zonefs_destroy_inodecache(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy the inode cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(zonefs_inode_cachep);
}

static int __init zonefs_init(void)
{
	int ret;

	BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE);

	ret = zonefs_init_inodecache();
	if (ret)
		return ret;

	ret = register_filesystem(&zonefs_type);
	if (ret) {
		zonefs_destroy_inodecache();
		return ret;
	}

	return 0;
}

static void __exit zonefs_exit(void)
{
	zonefs_destroy_inodecache();
	unregister_filesystem(&zonefs_type);
}

MODULE_AUTHOR("Damien Le Moal");
MODULE_DESCRIPTION("Zone file system for zoned block devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_FS("zonefs");
module_init(zonefs_init);
module_exit(zonefs_exit);