1// SPDX-License-Identifier: GPL-2.0
2#include <linux/fanotify.h>
3#include <linux/fcntl.h>
4#include <linux/file.h>
5#include <linux/fs.h>
6#include <linux/anon_inodes.h>
7#include <linux/fsnotify_backend.h>
8#include <linux/init.h>
9#include <linux/mount.h>
10#include <linux/namei.h>
11#include <linux/poll.h>
12#include <linux/security.h>
13#include <linux/syscalls.h>
14#include <linux/slab.h>
15#include <linux/types.h>
16#include <linux/uaccess.h>
17#include <linux/compat.h>
18#include <linux/sched/signal.h>
19#include <linux/memcontrol.h>
20#include <linux/statfs.h>
21#include <linux/exportfs.h>
22
23#include <asm/ioctls.h>
24
25#include "../../mount.h"
26#include "../fdinfo.h"
27#include "fanotify.h"
28
/*
 * Default limits used by fanotify_init().
 */
/* Queue depth (group->max_events) unless FAN_UNLIMITED_QUEUE is requested. */
#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
/* Marks per group (fanotify_data.max_marks) unless FAN_UNLIMITED_MARKS is requested. */
#define FANOTIFY_DEFAULT_MAX_MARKS	8192
/* Threshold compared against the per-user fanotify_listeners counter. */
#define FANOTIFY_DEFAULT_MAX_LISTENERS	128
32
/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 *
 * fanotify_init() fails with -EINVAL if event_f_flags has any bit set that
 * is not part of this mask.
 */
#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
		O_LARGEFILE	| O_NOATIME	)
45
/* Operations table handed to fsnotify_alloc_group() by fanotify_init(). */
extern const struct fsnotify_ops fanotify_fsnotify_ops;

/* Slab cache that fanotify_add_new_mark() allocates marks from. */
struct kmem_cache *fanotify_mark_cache __read_mostly;
/*
 * Slab caches for event objects.
 * NOTE(review): presumably one cache per event flavour (fid, path,
 * permission); they are not used in this part of the file - confirm at the
 * allocation sites.
 */
struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;

/* Info record lengths are rounded up to a multiple of this many bytes. */
#define FANOTIFY_EVENT_ALIGN 4
/* Fixed part of an info record: fid info header plus file_handle header. */
#define FANOTIFY_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
56
57static int fanotify_fid_info_len(int fh_len, int name_len)
58{
59	int info_len = fh_len;
60
61	if (name_len)
62		info_len += name_len + 1;
63
64	return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
65}
66
67static int fanotify_event_info_len(unsigned int fid_mode,
68				   struct fanotify_event *event)
69{
70	struct fanotify_info *info = fanotify_event_info(event);
71	int dir_fh_len = fanotify_event_dir_fh_len(event);
72	int fh_len = fanotify_event_object_fh_len(event);
73	int info_len = 0;
74	int dot_len = 0;
75
76	if (dir_fh_len) {
77		info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
78	} else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) {
79		/*
80		 * With group flag FAN_REPORT_NAME, if name was not recorded in
81		 * event on a directory, we will report the name ".".
82		 */
83		dot_len = 1;
84	}
85
86	if (fh_len)
87		info_len += fanotify_fid_info_len(fh_len, dot_len);
88
89	return info_len;
90}
91
92/*
93 * Get an fanotify notification event if one exists and is small
94 * enough to fit in "count". Return an error pointer if the count
95 * is not large enough. When permission event is dequeued, its state is
96 * updated accordingly.
97 */
98static struct fanotify_event *get_one_event(struct fsnotify_group *group,
99					    size_t count)
100{
101	size_t event_size = FAN_EVENT_METADATA_LEN;
102	struct fanotify_event *event = NULL;
103	unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
104
105	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
106
107	spin_lock(&group->notification_lock);
108	if (fsnotify_notify_queue_is_empty(group))
109		goto out;
110
111	if (fid_mode) {
112		event_size += fanotify_event_info_len(fid_mode,
113			FANOTIFY_E(fsnotify_peek_first_event(group)));
114	}
115
116	if (event_size > count) {
117		event = ERR_PTR(-EINVAL);
118		goto out;
119	}
120	event = FANOTIFY_E(fsnotify_remove_first_event(group));
121	if (fanotify_is_perm_event(event->mask))
122		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
123out:
124	spin_unlock(&group->notification_lock);
125	return event;
126}
127
128static int create_fd(struct fsnotify_group *group, struct path *path,
129		     struct file **file)
130{
131	int client_fd;
132	struct file *new_file;
133
134	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
135	if (client_fd < 0)
136		return client_fd;
137
138	/*
139	 * we need a new file handle for the userspace program so it can read even if it was
140	 * originally opened O_WRONLY.
141	 */
142	new_file = dentry_open(path,
143			       group->fanotify_data.f_flags | FMODE_NONOTIFY,
144			       current_cred());
145	if (IS_ERR(new_file)) {
146		/*
147		 * we still send an event even if we can't open the file.  this
148		 * can happen when say tasks are gone and we try to open their
149		 * /proc files or we try to open a WRONLY file like in sysfs
150		 * we just send the errno to userspace since there isn't much
151		 * else we can do.
152		 */
153		put_unused_fd(client_fd);
154		client_fd = PTR_ERR(new_file);
155	} else {
156		*file = new_file;
157	}
158
159	return client_fd;
160}
161
162/*
163 * Finish processing of permission event by setting it to ANSWERED state and
164 * drop group->notification_lock.
165 */
166static void finish_permission_event(struct fsnotify_group *group,
167				    struct fanotify_perm_event *event,
168				    unsigned int response)
169				    __releases(&group->notification_lock)
170{
171	bool destroy = false;
172
173	assert_spin_locked(&group->notification_lock);
174	event->response = response;
175	if (event->state == FAN_EVENT_CANCELED)
176		destroy = true;
177	else
178		event->state = FAN_EVENT_ANSWERED;
179	spin_unlock(&group->notification_lock);
180	if (destroy)
181		fsnotify_destroy_event(group, &event->fae.fse);
182}
183
184static int process_access_response(struct fsnotify_group *group,
185				   struct fanotify_response *response_struct)
186{
187	struct fanotify_perm_event *event;
188	int fd = response_struct->fd;
189	int response = response_struct->response;
190
191	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
192		 fd, response);
193	/*
194	 * make sure the response is valid, if invalid we do nothing and either
195	 * userspace can send a valid response or we will clean it up after the
196	 * timeout
197	 */
198	switch (response & ~FAN_AUDIT) {
199	case FAN_ALLOW:
200	case FAN_DENY:
201		break;
202	default:
203		return -EINVAL;
204	}
205
206	if (fd < 0)
207		return -EINVAL;
208
209	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
210		return -EINVAL;
211
212	spin_lock(&group->notification_lock);
213	list_for_each_entry(event, &group->fanotify_data.access_list,
214			    fae.fse.list) {
215		if (event->fd != fd)
216			continue;
217
218		list_del_init(&event->fae.fse.list);
219		finish_permission_event(group, event, response);
220		wake_up(&group->fanotify_data.access_waitq);
221		return 0;
222	}
223	spin_unlock(&group->notification_lock);
224
225	return -ENOENT;
226}
227
228static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
229			     int info_type, const char *name, size_t name_len,
230			     char __user *buf, size_t count)
231{
232	struct fanotify_event_info_fid info = { };
233	struct file_handle handle = { };
234	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
235	size_t fh_len = fh ? fh->len : 0;
236	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
237	size_t len = info_len;
238
239	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
240		 __func__, fh_len, name_len, info_len, count);
241
242	if (!fh_len)
243		return 0;
244
245	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
246		return -EFAULT;
247
248	/*
249	 * Copy event info fid header followed by variable sized file handle
250	 * and optionally followed by variable sized filename.
251	 */
252	switch (info_type) {
253	case FAN_EVENT_INFO_TYPE_FID:
254	case FAN_EVENT_INFO_TYPE_DFID:
255		if (WARN_ON_ONCE(name_len))
256			return -EFAULT;
257		break;
258	case FAN_EVENT_INFO_TYPE_DFID_NAME:
259		if (WARN_ON_ONCE(!name || !name_len))
260			return -EFAULT;
261		break;
262	default:
263		return -EFAULT;
264	}
265
266	info.hdr.info_type = info_type;
267	info.hdr.len = len;
268	info.fsid = *fsid;
269	if (copy_to_user(buf, &info, sizeof(info)))
270		return -EFAULT;
271
272	buf += sizeof(info);
273	len -= sizeof(info);
274	if (WARN_ON_ONCE(len < sizeof(handle)))
275		return -EFAULT;
276
277	handle.handle_type = fh->type;
278	handle.handle_bytes = fh_len;
279	if (copy_to_user(buf, &handle, sizeof(handle)))
280		return -EFAULT;
281
282	buf += sizeof(handle);
283	len -= sizeof(handle);
284	if (WARN_ON_ONCE(len < fh_len))
285		return -EFAULT;
286
287	/*
288	 * For an inline fh and inline file name, copy through stack to exclude
289	 * the copy from usercopy hardening protections.
290	 */
291	fh_buf = fanotify_fh_buf(fh);
292	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
293		memcpy(bounce, fh_buf, fh_len);
294		fh_buf = bounce;
295	}
296	if (copy_to_user(buf, fh_buf, fh_len))
297		return -EFAULT;
298
299	buf += fh_len;
300	len -= fh_len;
301
302	if (name_len) {
303		/* Copy the filename with terminating null */
304		name_len++;
305		if (WARN_ON_ONCE(len < name_len))
306			return -EFAULT;
307
308		if (copy_to_user(buf, name, name_len))
309			return -EFAULT;
310
311		buf += name_len;
312		len -= name_len;
313	}
314
315	/* Pad with 0's */
316	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
317	if (len > 0 && clear_user(buf, len))
318		return -EFAULT;
319
320	return info_len;
321}
322
/*
 * Copy one dequeued event to the user buffer: the fixed size metadata
 * followed by the optional fid info records.
 *
 * For events that carry a path, a new file is opened and a descriptor is
 * reserved for it; the descriptor is only installed in the caller's fd
 * table once everything has been copied successfully.
 *
 * Returns the number of bytes written (metadata.event_len) or a negative
 * errno. On failure after the open, the reserved descriptor and the opened
 * file are released again.
 */
static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fanotify_event *event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	struct path *path = fanotify_event_path(event);
	struct fanotify_info *info = fanotify_event_info(event);
	unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	struct file *f = NULL;
	int ret, fd = FAN_NOFD;
	int info_type = 0;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	metadata.event_len = FAN_EVENT_METADATA_LEN +
				fanotify_event_info_len(fid_mode, event);
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);

	/* Only events with a complete path get an open fd; others report FAN_NOFD. */
	if (path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
		if (fd < 0)
			return fd;
	}
	metadata.fd = fd;

	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	buf += FAN_EVENT_METADATA_LEN;
	count -= FAN_EVENT_METADATA_LEN;

	/* Remember the fd so process_access_response() can match the reply. */
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->fd = fd;

	/* Event info records order is: dir fid + name, child fid */
	if (fanotify_event_dir_fh_len(event)) {
		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
					     FAN_EVENT_INFO_TYPE_DFID;
		ret = copy_info_to_user(fanotify_event_fsid(event),
					fanotify_info_dir_fh(info),
					info_type, fanotify_info_name(info),
					info->name_len, buf, count);
		if (ret < 0)
			goto out_close_fd;

		buf += ret;
		count -= ret;
	}

	if (fanotify_event_object_fh_len(event)) {
		const char *dot = NULL;
		int dot_len = 0;

		/* info_type is non-zero here only if a dir record was written above. */
		if (fid_mode == FAN_REPORT_FID || info_type) {
			/*
			 * With only group flag FAN_REPORT_FID only type FID is
			 * reported. Second info record type is always FID.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		} else if ((fid_mode & FAN_REPORT_NAME) &&
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_NAME, if name was not
			 * recorded in an event on a directory, report the
			 * name "." with info type DFID_NAME.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
			dot = ".";
			dot_len = 1;
		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_DIR_FID, a single info
			 * record has type DFID for directory entry modification
			 * event and for event on a directory.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID;
		} else {
			/*
			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
			 * a single info record has type FID for event on a
			 * non-directory, when there is no directory to report.
			 * For example, on FAN_DELETE_SELF event.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		}

		ret = copy_info_to_user(fanotify_event_fsid(event),
					fanotify_event_object_fh(event),
					info_type, dot, dot_len, buf, count);
		if (ret < 0)
			goto out_close_fd;

		buf += ret;
		count -= ret;
	}

	/* Everything was copied: make the descriptor visible to the listener. */
	if (f)
		fd_install(fd, f);

	return metadata.event_len;

out_close_fd:
	/* fd was never installed, so release the reservation and the file. */
	if (fd != FAN_NOFD) {
		put_unused_fd(fd);
		fput(f);
	}
	return ret;
}
444
/* fanotify userspace file descriptor functions */
446static __poll_t fanotify_poll(struct file *file, poll_table *wait)
447{
448	struct fsnotify_group *group = file->private_data;
449	__poll_t ret = 0;
450
451	poll_wait(file, &group->notification_waitq, wait);
452	spin_lock(&group->notification_lock);
453	if (!fsnotify_notify_queue_is_empty(group))
454		ret = EPOLLIN | EPOLLRDNORM;
455	spin_unlock(&group->notification_lock);
456
457	return ret;
458}
459
/*
 * read() handler: copy as many queued events as fit into the user buffer.
 *
 * Blocks (interruptibly) only while nothing has been copied yet and the
 * file is not O_NONBLOCK. Returns the number of bytes copied if any event
 * was delivered and no -EFAULT occurred; otherwise the last error:
 * -EAGAIN, -ERESTARTSYS, -EINVAL (buffer too small for the next event) or
 * an error from copy_event_to_user().
 */
static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			/* Queue is empty: decide whether to return or to sleep. */
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			/* Something was already copied: return it instead of blocking. */
			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);
		if (unlikely(ret == -EOPENSTALE)) {
			/*
			 * We cannot report events with stale fd so drop it.
			 * Setting ret to 0 will continue the event loop and
			 * do the right thing if there are no more events to
			 * read (i.e. return bytes read, -EAGAIN or wait).
			 */
			ret = 0;
		}

		/*
		 * Permission events get queued to wait for response.  Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0) {
				/*
				 * The listener never saw this event, so answer
				 * it with FAN_DENY on its behalf.
				 * finish_permission_event() drops the lock.
				 */
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					&group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	/* Partial success wins over the last error, except for -EFAULT. */
	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}
544
545static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
546{
547	struct fanotify_response response = { .fd = -1, .response = -1 };
548	struct fsnotify_group *group;
549	int ret;
550
551	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
552		return -EINVAL;
553
554	group = file->private_data;
555
556	if (count < sizeof(response))
557		return -EINVAL;
558
559	count = sizeof(response);
560
561	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
562
563	if (copy_from_user(&response, buf, count))
564		return -EFAULT;
565
566	ret = process_access_response(group, &response);
567	if (ret < 0)
568		count = ret;
569
570	return count;
571}
572
/*
 * release() handler, called when the fanotify file is closed for the last
 * time: stop queueing, answer every outstanding permission event with
 * FAN_ALLOW, discard all other queued events and destroy the group.
 * Always returns 0.
 */
static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;

	/*
	 * Stop new events from arriving in the notification queue. since
	 * userspace cannot use fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		/* Drops notification_lock; retake it for the next iteration. */
		finish_permission_event(group, event, FAN_ALLOW);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while (!fsnotify_notify_queue_is_empty(group)) {
		struct fanotify_event *event;

		event = FANOTIFY_E(fsnotify_remove_first_event(group));
		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			/* Events are destroyed outside of the spinlock. */
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, &event->fse);
		} else {
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW);
		}
		/* Both branches above left the lock released. */
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wakeup waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}
627
628static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
629{
630	struct fsnotify_group *group;
631	struct fsnotify_event *fsn_event;
632	void __user *p;
633	int ret = -ENOTTY;
634	size_t send_len = 0;
635
636	group = file->private_data;
637
638	p = (void __user *) arg;
639
640	switch (cmd) {
641	case FIONREAD:
642		spin_lock(&group->notification_lock);
643		list_for_each_entry(fsn_event, &group->notification_list, list)
644			send_len += FAN_EVENT_METADATA_LEN;
645		spin_unlock(&group->notification_lock);
646		ret = put_user(send_len, (int __user *) p);
647		break;
648	}
649
650	return ret;
651}
652
/*
 * File operations of the fanotify group file, passed to anon_inode_getfd()
 * by fanotify_init(). Events are consumed with read(), permission responses
 * are submitted with write(); fasync is explicitly left unset.
 */
static const struct file_operations fanotify_fops = {
	.show_fdinfo	= fanotify_show_fdinfo,
	.poll		= fanotify_poll,
	.read		= fanotify_read,
	.write		= fanotify_write,
	.fasync		= NULL,
	.release	= fanotify_release,
	.unlocked_ioctl	= fanotify_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};
664
665static int fanotify_find_path(int dfd, const char __user *filename,
666			      struct path *path, unsigned int flags, __u64 mask,
667			      unsigned int obj_type)
668{
669	int ret;
670
671	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
672		 dfd, filename, flags);
673
674	if (filename == NULL) {
675		struct fd f = fdget(dfd);
676
677		ret = -EBADF;
678		if (!f.file)
679			goto out;
680
681		ret = -ENOTDIR;
682		if ((flags & FAN_MARK_ONLYDIR) &&
683		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
684			fdput(f);
685			goto out;
686		}
687
688		*path = f.file->f_path;
689		path_get(path);
690		fdput(f);
691	} else {
692		unsigned int lookup_flags = 0;
693
694		if (!(flags & FAN_MARK_DONT_FOLLOW))
695			lookup_flags |= LOOKUP_FOLLOW;
696		if (flags & FAN_MARK_ONLYDIR)
697			lookup_flags |= LOOKUP_DIRECTORY;
698
699		ret = user_path_at(dfd, filename, lookup_flags, path);
700		if (ret)
701			goto out;
702	}
703
704	/* you can only watch an inode if you have read permissions on it */
705	ret = inode_permission(path->dentry->d_inode, MAY_READ);
706	if (ret) {
707		path_put(path);
708		goto out;
709	}
710
711	ret = security_path_notify(path, mask, obj_type);
712	if (ret)
713		path_put(path);
714
715out:
716	return ret;
717}
718
719static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
720					    __u32 mask, unsigned int flags,
721					    __u32 umask, int *destroy)
722{
723	__u32 oldmask = 0;
724
725	/* umask bits cannot be removed by user */
726	mask &= ~umask;
727	spin_lock(&fsn_mark->lock);
728	if (!(flags & FAN_MARK_IGNORED_MASK)) {
729		oldmask = fsn_mark->mask;
730		fsn_mark->mask &= ~mask;
731	} else {
732		fsn_mark->ignored_mask &= ~mask;
733	}
734	/*
735	 * We need to keep the mark around even if remaining mask cannot
736	 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
737	 * changes to the mask.
738	 * Destroy mark when only umask bits remain.
739	 */
740	*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
741	spin_unlock(&fsn_mark->lock);
742
743	return mask & oldmask;
744}
745
746static int fanotify_remove_mark(struct fsnotify_group *group,
747				fsnotify_connp_t *connp, __u32 mask,
748				unsigned int flags, __u32 umask)
749{
750	struct fsnotify_mark *fsn_mark = NULL;
751	__u32 removed;
752	int destroy_mark;
753
754	mutex_lock(&group->mark_mutex);
755	fsn_mark = fsnotify_find_mark(connp, group);
756	if (!fsn_mark) {
757		mutex_unlock(&group->mark_mutex);
758		return -ENOENT;
759	}
760
761	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
762						 umask, &destroy_mark);
763	if (removed & fsnotify_conn_mask(fsn_mark->connector))
764		fsnotify_recalc_mask(fsn_mark->connector);
765	if (destroy_mark)
766		fsnotify_detach_mark(fsn_mark);
767	mutex_unlock(&group->mark_mutex);
768	if (destroy_mark)
769		fsnotify_free_mark(fsn_mark);
770
771	/* matches the fsnotify_find_mark() */
772	fsnotify_put_mark(fsn_mark);
773	return 0;
774}
775
776static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
777					 struct vfsmount *mnt, __u32 mask,
778					 unsigned int flags, __u32 umask)
779{
780	return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
781				    mask, flags, umask);
782}
783
784static int fanotify_remove_sb_mark(struct fsnotify_group *group,
785				   struct super_block *sb, __u32 mask,
786				   unsigned int flags, __u32 umask)
787{
788	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
789				    flags, umask);
790}
791
792static int fanotify_remove_inode_mark(struct fsnotify_group *group,
793				      struct inode *inode, __u32 mask,
794				      unsigned int flags, __u32 umask)
795{
796	return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
797				    flags, umask);
798}
799
800static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
801				       __u32 mask,
802				       unsigned int flags)
803{
804	__u32 oldmask = -1;
805
806	spin_lock(&fsn_mark->lock);
807	if (!(flags & FAN_MARK_IGNORED_MASK)) {
808		oldmask = fsn_mark->mask;
809		fsn_mark->mask |= mask;
810	} else {
811		fsn_mark->ignored_mask |= mask;
812		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
813			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
814	}
815	spin_unlock(&fsn_mark->lock);
816
817	return mask & ~oldmask;
818}
819
820static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
821						   fsnotify_connp_t *connp,
822						   unsigned int type,
823						   __kernel_fsid_t *fsid)
824{
825	struct fsnotify_mark *mark;
826	int ret;
827
828	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
829		return ERR_PTR(-ENOSPC);
830
831	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
832	if (!mark)
833		return ERR_PTR(-ENOMEM);
834
835	fsnotify_init_mark(mark, group);
836	ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
837	if (ret) {
838		fsnotify_put_mark(mark);
839		return ERR_PTR(ret);
840	}
841
842	return mark;
843}
844
845
846static int fanotify_add_mark(struct fsnotify_group *group,
847			     fsnotify_connp_t *connp, unsigned int type,
848			     __u32 mask, unsigned int flags,
849			     __kernel_fsid_t *fsid)
850{
851	struct fsnotify_mark *fsn_mark;
852	__u32 added;
853
854	mutex_lock(&group->mark_mutex);
855	fsn_mark = fsnotify_find_mark(connp, group);
856	if (!fsn_mark) {
857		fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
858		if (IS_ERR(fsn_mark)) {
859			mutex_unlock(&group->mark_mutex);
860			return PTR_ERR(fsn_mark);
861		}
862	}
863	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
864	if (added & ~fsnotify_conn_mask(fsn_mark->connector))
865		fsnotify_recalc_mask(fsn_mark->connector);
866	mutex_unlock(&group->mark_mutex);
867
868	fsnotify_put_mark(fsn_mark);
869	return 0;
870}
871
872static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
873				      struct vfsmount *mnt, __u32 mask,
874				      unsigned int flags, __kernel_fsid_t *fsid)
875{
876	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
877				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
878}
879
880static int fanotify_add_sb_mark(struct fsnotify_group *group,
881				struct super_block *sb, __u32 mask,
882				unsigned int flags, __kernel_fsid_t *fsid)
883{
884	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
885				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
886}
887
888static int fanotify_add_inode_mark(struct fsnotify_group *group,
889				   struct inode *inode, __u32 mask,
890				   unsigned int flags, __kernel_fsid_t *fsid)
891{
892	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
893
894	/*
895	 * If some other task has this inode open for write we should not add
896	 * an ignored mark, unless that ignored mark is supposed to survive
897	 * modification changes anyway.
898	 */
899	if ((flags & FAN_MARK_IGNORED_MASK) &&
900	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
901	    inode_is_open_for_write(inode))
902		return 0;
903
904	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
905				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
906}
907
908static struct fsnotify_event *fanotify_alloc_overflow_event(void)
909{
910	struct fanotify_event *oevent;
911
912	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
913	if (!oevent)
914		return NULL;
915
916	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
917	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
918
919	return &oevent->fse;
920}
921
922/* fanotify syscalls */
/*
 * fanotify_init() - create a fanotify group and return a descriptor for it.
 * @flags:         FAN_* initialisation flags (class, FAN_CLOEXEC,
 *                 FAN_NONBLOCK, FAN_UNLIMITED_*, FAN_ENABLE_AUDIT,
 *                 FAN_REPORT_*)
 * @event_f_flags: open flags used for the files opened when events are
 *                 reported; restricted to FANOTIFY_INIT_ALL_EVENT_F_BITS
 *
 * Returns the new descriptor, or -EPERM (missing capability), -EINVAL
 * (bad flag combination), -EMFILE (too many listeners for this user),
 * -ENOMEM, or an error from group or descriptor allocation.
 */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	struct user_struct *user;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* FAN_ENABLE_AUDIT is only an accepted flag with CONFIG_AUDITSYSCALL. */
#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	/* fid reporting is only available to the notification class. */
	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/* Takes a user reference; dropped with free_uid() on the early error paths. */
	user = get_current_user();
	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
		free_uid(user);
		return -EMFILE;
	}

	/* Flags of the group file itself, not of the per-event files. */
	f_flags = O_RDWR | FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
	if (IS_ERR(group)) {
		free_uid(user);
		return PTR_ERR(group);
	}

	/*
	 * From here on the user reference is stored in the group.
	 * NOTE(review): presumably fsnotify_destroy_group() releases it and
	 * undoes the listener count on the error paths below - confirm in the
	 * group teardown code.
	 */
	group->fanotify_data.user = user;
	group->fanotify_data.flags = flags;
	atomic_inc(&user->fanotify_listeners);
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	/* Map the notification class to the group priority. */
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FS_PRIO_0;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FS_PRIO_1;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FS_PRIO_2;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->fanotify_data.max_marks = UINT_MAX;
	} else {
		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
1050
1051/* Check if filesystem can encode a unique fid */
1052static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
1053{
1054	__kernel_fsid_t root_fsid;
1055	int err;
1056
1057	/*
1058	 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
1059	 */
1060	err = vfs_get_fsid(path->dentry, fsid);
1061	if (err)
1062		return err;
1063
1064	if (!fsid->val[0] && !fsid->val[1])
1065		return -ENODEV;
1066
1067	/*
1068	 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
1069	 * which uses a different fsid than sb root.
1070	 */
1071	err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
1072	if (err)
1073		return err;
1074
1075	if (root_fsid.val[0] != fsid->val[0] ||
1076	    root_fsid.val[1] != fsid->val[1])
1077		return -EXDEV;
1078
1079	/*
1080	 * We need to make sure that the file system supports at least
1081	 * encoding a file handle so user can use name_to_handle_at() to
1082	 * compare fid returned with event to the file handle of watched
1083	 * objects. However, name_to_handle_at() requires that the
1084	 * filesystem also supports decoding file handles.
1085	 */
1086	if (!path->dentry->d_sb->s_export_op ||
1087	    !path->dentry->d_sb->s_export_op->fh_to_dentry)
1088		return -EOPNOTSUPP;
1089
1090	return 0;
1091}
1092
1093static int fanotify_events_supported(struct path *path, __u64 mask,
1094				     unsigned int flags)
1095{
1096	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1097
1098	/*
1099	 * Some filesystems such as 'proc' acquire unusual locks when opening
1100	 * files. For them fanotify permission events have high chances of
1101	 * deadlocking the system - open done when reporting fanotify event
1102	 * blocks on this "unusual" lock while another process holding the lock
1103	 * waits for fanotify permission event to be answered. Just disallow
1104	 * permission events for such filesystems.
1105	 */
1106	if (mask & FANOTIFY_PERM_EVENTS &&
1107	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1108		return -EINVAL;
1109
1110	/*
1111	 * mount and sb marks are not allowed on kernel internal pseudo fs,
1112	 * like pipe_mnt, because that would subscribe to events on all the
1113	 * anonynous pipes in the system.
1114	 *
1115	 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
1116	 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
1117	 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
1118	 * mark is questionable. For now we leave them alone.
1119	 */
1120	if (mark_type != FAN_MARK_INODE &&
1121	    path->mnt->mnt_sb->s_flags & SB_NOUSER)
1122		return -EINVAL;
1123
1124	return 0;
1125}
1126
1127static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1128			    int dfd, const char  __user *pathname)
1129{
1130	struct inode *inode = NULL;
1131	struct vfsmount *mnt = NULL;
1132	struct fsnotify_group *group;
1133	struct fd f;
1134	struct path path;
1135	__kernel_fsid_t __fsid, *fsid = NULL;
1136	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
1137	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1138	bool ignored = flags & FAN_MARK_IGNORED_MASK;
1139	unsigned int obj_type, fid_mode;
1140	u32 umask = 0;
1141	int ret;
1142
1143	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1144		 __func__, fanotify_fd, flags, dfd, pathname, mask);
1145
1146	/* we only use the lower 32 bits as of right now. */
1147	if (mask & ((__u64)0xffffffff << 32))
1148		return -EINVAL;
1149
1150	if (flags & ~FANOTIFY_MARK_FLAGS)
1151		return -EINVAL;
1152
1153	switch (mark_type) {
1154	case FAN_MARK_INODE:
1155		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1156		break;
1157	case FAN_MARK_MOUNT:
1158		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1159		break;
1160	case FAN_MARK_FILESYSTEM:
1161		obj_type = FSNOTIFY_OBJ_TYPE_SB;
1162		break;
1163	default:
1164		return -EINVAL;
1165	}
1166
1167	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
1168	case FAN_MARK_ADD:
1169	case FAN_MARK_REMOVE:
1170		if (!mask)
1171			return -EINVAL;
1172		break;
1173	case FAN_MARK_FLUSH:
1174		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
1175			return -EINVAL;
1176		break;
1177	default:
1178		return -EINVAL;
1179	}
1180
1181	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
1182		valid_mask |= FANOTIFY_PERM_EVENTS;
1183
1184	if (mask & ~valid_mask)
1185		return -EINVAL;
1186
1187	/* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
1188	if (ignored)
1189		mask &= ~FANOTIFY_EVENT_FLAGS;
1190
1191	f = fdget(fanotify_fd);
1192	if (unlikely(!f.file))
1193		return -EBADF;
1194
1195	/* verify that this is indeed an fanotify instance */
1196	ret = -EINVAL;
1197	if (unlikely(f.file->f_op != &fanotify_fops))
1198		goto fput_and_out;
1199	group = f.file->private_data;
1200
1201	/*
1202	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
1203	 * allowed to set permissions events.
1204	 */
1205	ret = -EINVAL;
1206	if (mask & FANOTIFY_PERM_EVENTS &&
1207	    group->priority == FS_PRIO_0)
1208		goto fput_and_out;
1209
1210	/*
1211	 * Events with data type inode do not carry enough information to report
1212	 * event->fd, so we do not allow setting a mask for inode events unless
1213	 * group supports reporting fid.
1214	 * inode events are not supported on a mount mark, because they do not
1215	 * carry enough information (i.e. path) to be filtered by mount point.
1216	 */
1217	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
1218	if (mask & FANOTIFY_INODE_EVENTS &&
1219	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
1220		goto fput_and_out;
1221
1222	if (flags & FAN_MARK_FLUSH) {
1223		ret = 0;
1224		if (mark_type == FAN_MARK_MOUNT)
1225			fsnotify_clear_vfsmount_marks_by_group(group);
1226		else if (mark_type == FAN_MARK_FILESYSTEM)
1227			fsnotify_clear_sb_marks_by_group(group);
1228		else
1229			fsnotify_clear_inode_marks_by_group(group);
1230		goto fput_and_out;
1231	}
1232
1233	ret = fanotify_find_path(dfd, pathname, &path, flags,
1234			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
1235	if (ret)
1236		goto fput_and_out;
1237
1238	if (flags & FAN_MARK_ADD) {
1239		ret = fanotify_events_supported(&path, mask, flags);
1240		if (ret)
1241			goto path_put_and_out;
1242	}
1243
1244	if (fid_mode) {
1245		ret = fanotify_test_fid(&path, &__fsid);
1246		if (ret)
1247			goto path_put_and_out;
1248
1249		fsid = &__fsid;
1250	}
1251
1252	/* inode held in place by reference to path; group by fget on fd */
1253	if (mark_type == FAN_MARK_INODE)
1254		inode = path.dentry->d_inode;
1255	else
1256		mnt = path.mnt;
1257
1258	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
1259	if (mnt || !S_ISDIR(inode->i_mode)) {
1260		mask &= ~FAN_EVENT_ON_CHILD;
1261		umask = FAN_EVENT_ON_CHILD;
1262		/*
1263		 * If group needs to report parent fid, register for getting
1264		 * events with parent/name info for non-directory.
1265		 */
1266		if ((fid_mode & FAN_REPORT_DIR_FID) &&
1267		    (flags & FAN_MARK_ADD) && !ignored)
1268			mask |= FAN_EVENT_ON_CHILD;
1269	}
1270
1271	/* create/update an inode mark */
1272	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
1273	case FAN_MARK_ADD:
1274		if (mark_type == FAN_MARK_MOUNT)
1275			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
1276							 flags, fsid);
1277		else if (mark_type == FAN_MARK_FILESYSTEM)
1278			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
1279						   flags, fsid);
1280		else
1281			ret = fanotify_add_inode_mark(group, inode, mask,
1282						      flags, fsid);
1283		break;
1284	case FAN_MARK_REMOVE:
1285		if (mark_type == FAN_MARK_MOUNT)
1286			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
1287							    flags, umask);
1288		else if (mark_type == FAN_MARK_FILESYSTEM)
1289			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
1290						      flags, umask);
1291		else
1292			ret = fanotify_remove_inode_mark(group, inode, mask,
1293							 flags, umask);
1294		break;
1295	default:
1296		ret = -EINVAL;
1297	}
1298
1299path_put_and_out:
1300	path_put(&path);
1301fput_and_out:
1302	fdput(f);
1303	return ret;
1304}
1305
1306#ifndef CONFIG_ARCH_SPLIT_ARG64
1307SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
1308			      __u64, mask, int, dfd,
1309			      const char  __user *, pathname)
1310{
1311	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
1312}
1313#endif
1314
1315#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
1316SYSCALL32_DEFINE6(fanotify_mark,
1317				int, fanotify_fd, unsigned int, flags,
1318				SC_ARG64(mask), int, dfd,
1319				const char  __user *, pathname)
1320{
1321	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
1322				dfd, pathname);
1323}
1324#endif
1325
1326/*
1327 * fanotify_user_setup - Our initialization function.  Note that we cannot return
1328 * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
1329 * must result in panic().
1330 */
1331static int __init fanotify_user_setup(void)
1332{
1333	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
1334	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
1335
1336	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
1337					 SLAB_PANIC|SLAB_ACCOUNT);
1338	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
1339					       SLAB_PANIC);
1340	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
1341						SLAB_PANIC);
1342	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
1343		fanotify_perm_event_cachep =
1344			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
1345	}
1346
1347	return 0;
1348}
1349device_initcall(fanotify_user_setup);
1350