162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/* Watch queue and general notification mechanism, built on pipes
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
562306a36Sopenharmony_ci * Written by David Howells (dhowells@redhat.com)
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * See Documentation/core-api/watch_queue.rst
862306a36Sopenharmony_ci */
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#define pr_fmt(fmt) "watchq: " fmt
1162306a36Sopenharmony_ci#include <linux/module.h>
1262306a36Sopenharmony_ci#include <linux/init.h>
1362306a36Sopenharmony_ci#include <linux/sched.h>
1462306a36Sopenharmony_ci#include <linux/slab.h>
1562306a36Sopenharmony_ci#include <linux/printk.h>
1662306a36Sopenharmony_ci#include <linux/miscdevice.h>
1762306a36Sopenharmony_ci#include <linux/fs.h>
1862306a36Sopenharmony_ci#include <linux/mm.h>
1962306a36Sopenharmony_ci#include <linux/pagemap.h>
2062306a36Sopenharmony_ci#include <linux/poll.h>
2162306a36Sopenharmony_ci#include <linux/uaccess.h>
2262306a36Sopenharmony_ci#include <linux/vmalloc.h>
2362306a36Sopenharmony_ci#include <linux/file.h>
2462306a36Sopenharmony_ci#include <linux/security.h>
2562306a36Sopenharmony_ci#include <linux/cred.h>
2662306a36Sopenharmony_ci#include <linux/sched/signal.h>
2762306a36Sopenharmony_ci#include <linux/watch_queue.h>
2862306a36Sopenharmony_ci#include <linux/pipe_fs_i.h>
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_ciMODULE_DESCRIPTION("Watch queue");
3162306a36Sopenharmony_ciMODULE_AUTHOR("Red Hat, Inc.");
3262306a36Sopenharmony_ci
3362306a36Sopenharmony_ci#define WATCH_QUEUE_NOTE_SIZE 128
3462306a36Sopenharmony_ci#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
3562306a36Sopenharmony_ci
3662306a36Sopenharmony_ci/*
3762306a36Sopenharmony_ci * This must be called under the RCU read-lock, which makes
3862306a36Sopenharmony_ci * sure that the wqueue still exists. It can then take the lock,
3962306a36Sopenharmony_ci * and check that the wqueue hasn't been destroyed, which in
4062306a36Sopenharmony_ci * turn makes sure that the notification pipe still exists.
4162306a36Sopenharmony_ci */
4262306a36Sopenharmony_cistatic inline bool lock_wqueue(struct watch_queue *wqueue)
4362306a36Sopenharmony_ci{
4462306a36Sopenharmony_ci	spin_lock_bh(&wqueue->lock);
4562306a36Sopenharmony_ci	if (unlikely(!wqueue->pipe)) {
4662306a36Sopenharmony_ci		spin_unlock_bh(&wqueue->lock);
4762306a36Sopenharmony_ci		return false;
4862306a36Sopenharmony_ci	}
4962306a36Sopenharmony_ci	return true;
5062306a36Sopenharmony_ci}
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_cistatic inline void unlock_wqueue(struct watch_queue *wqueue)
5362306a36Sopenharmony_ci{
5462306a36Sopenharmony_ci	spin_unlock_bh(&wqueue->lock);
5562306a36Sopenharmony_ci}
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_cistatic void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
5862306a36Sopenharmony_ci					 struct pipe_buffer *buf)
5962306a36Sopenharmony_ci{
6062306a36Sopenharmony_ci	struct watch_queue *wqueue = (struct watch_queue *)buf->private;
6162306a36Sopenharmony_ci	struct page *page;
6262306a36Sopenharmony_ci	unsigned int bit;
6362306a36Sopenharmony_ci
6462306a36Sopenharmony_ci	/* We need to work out which note within the page this refers to, but
6562306a36Sopenharmony_ci	 * the note might have been maximum size, so merely ANDing the offset
6662306a36Sopenharmony_ci	 * off doesn't work.  OTOH, the note must've been more than zero size.
6762306a36Sopenharmony_ci	 */
6862306a36Sopenharmony_ci	bit = buf->offset + buf->len;
6962306a36Sopenharmony_ci	if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
7062306a36Sopenharmony_ci		bit -= WATCH_QUEUE_NOTE_SIZE;
7162306a36Sopenharmony_ci	bit /= WATCH_QUEUE_NOTE_SIZE;
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ci	page = buf->page;
7462306a36Sopenharmony_ci	bit += page->index;
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci	set_bit(bit, wqueue->notes_bitmap);
7762306a36Sopenharmony_ci	generic_pipe_buf_release(pipe, buf);
7862306a36Sopenharmony_ci}
7962306a36Sopenharmony_ci
8062306a36Sopenharmony_ci// No try_steal function => no stealing
8162306a36Sopenharmony_ci#define watch_queue_pipe_buf_try_steal NULL
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci/* New data written to a pipe may be appended to a buffer with this type. */
8462306a36Sopenharmony_cistatic const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
8562306a36Sopenharmony_ci	.release	= watch_queue_pipe_buf_release,
8662306a36Sopenharmony_ci	.try_steal	= watch_queue_pipe_buf_try_steal,
8762306a36Sopenharmony_ci	.get		= generic_pipe_buf_get,
8862306a36Sopenharmony_ci};
8962306a36Sopenharmony_ci
9062306a36Sopenharmony_ci/*
9162306a36Sopenharmony_ci * Post a notification to a watch queue.
9262306a36Sopenharmony_ci *
9362306a36Sopenharmony_ci * Must be called with the RCU lock for reading, and the
9462306a36Sopenharmony_ci * watch_queue lock held, which guarantees that the pipe
9562306a36Sopenharmony_ci * hasn't been released.
9662306a36Sopenharmony_ci */
9762306a36Sopenharmony_cistatic bool post_one_notification(struct watch_queue *wqueue,
9862306a36Sopenharmony_ci				  struct watch_notification *n)
9962306a36Sopenharmony_ci{
10062306a36Sopenharmony_ci	void *p;
10162306a36Sopenharmony_ci	struct pipe_inode_info *pipe = wqueue->pipe;
10262306a36Sopenharmony_ci	struct pipe_buffer *buf;
10362306a36Sopenharmony_ci	struct page *page;
10462306a36Sopenharmony_ci	unsigned int head, tail, mask, note, offset, len;
10562306a36Sopenharmony_ci	bool done = false;
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci	spin_lock_irq(&pipe->rd_wait.lock);
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci	mask = pipe->ring_size - 1;
11062306a36Sopenharmony_ci	head = pipe->head;
11162306a36Sopenharmony_ci	tail = pipe->tail;
11262306a36Sopenharmony_ci	if (pipe_full(head, tail, pipe->ring_size))
11362306a36Sopenharmony_ci		goto lost;
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci	note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
11662306a36Sopenharmony_ci	if (note >= wqueue->nr_notes)
11762306a36Sopenharmony_ci		goto lost;
11862306a36Sopenharmony_ci
11962306a36Sopenharmony_ci	page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
12062306a36Sopenharmony_ci	offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
12162306a36Sopenharmony_ci	get_page(page);
12262306a36Sopenharmony_ci	len = n->info & WATCH_INFO_LENGTH;
12362306a36Sopenharmony_ci	p = kmap_atomic(page);
12462306a36Sopenharmony_ci	memcpy(p + offset, n, len);
12562306a36Sopenharmony_ci	kunmap_atomic(p);
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci	buf = &pipe->bufs[head & mask];
12862306a36Sopenharmony_ci	buf->page = page;
12962306a36Sopenharmony_ci	buf->private = (unsigned long)wqueue;
13062306a36Sopenharmony_ci	buf->ops = &watch_queue_pipe_buf_ops;
13162306a36Sopenharmony_ci	buf->offset = offset;
13262306a36Sopenharmony_ci	buf->len = len;
13362306a36Sopenharmony_ci	buf->flags = PIPE_BUF_FLAG_WHOLE;
13462306a36Sopenharmony_ci	smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */
13562306a36Sopenharmony_ci
13662306a36Sopenharmony_ci	if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
13762306a36Sopenharmony_ci		spin_unlock_irq(&pipe->rd_wait.lock);
13862306a36Sopenharmony_ci		BUG();
13962306a36Sopenharmony_ci	}
14062306a36Sopenharmony_ci	wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
14162306a36Sopenharmony_ci	done = true;
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ciout:
14462306a36Sopenharmony_ci	spin_unlock_irq(&pipe->rd_wait.lock);
14562306a36Sopenharmony_ci	if (done)
14662306a36Sopenharmony_ci		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
14762306a36Sopenharmony_ci	return done;
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_cilost:
15062306a36Sopenharmony_ci	buf = &pipe->bufs[(head - 1) & mask];
15162306a36Sopenharmony_ci	buf->flags |= PIPE_BUF_FLAG_LOSS;
15262306a36Sopenharmony_ci	goto out;
15362306a36Sopenharmony_ci}
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci/*
15662306a36Sopenharmony_ci * Apply filter rules to a notification.
15762306a36Sopenharmony_ci */
15862306a36Sopenharmony_cistatic bool filter_watch_notification(const struct watch_filter *wf,
15962306a36Sopenharmony_ci				      const struct watch_notification *n)
16062306a36Sopenharmony_ci{
16162306a36Sopenharmony_ci	const struct watch_type_filter *wt;
16262306a36Sopenharmony_ci	unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8;
16362306a36Sopenharmony_ci	unsigned int st_index = n->subtype / st_bits;
16462306a36Sopenharmony_ci	unsigned int st_bit = 1U << (n->subtype % st_bits);
16562306a36Sopenharmony_ci	int i;
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci	if (!test_bit(n->type, wf->type_filter))
16862306a36Sopenharmony_ci		return false;
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci	for (i = 0; i < wf->nr_filters; i++) {
17162306a36Sopenharmony_ci		wt = &wf->filters[i];
17262306a36Sopenharmony_ci		if (n->type == wt->type &&
17362306a36Sopenharmony_ci		    (wt->subtype_filter[st_index] & st_bit) &&
17462306a36Sopenharmony_ci		    (n->info & wt->info_mask) == wt->info_filter)
17562306a36Sopenharmony_ci			return true;
17662306a36Sopenharmony_ci	}
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci	return false; /* If there is a filter, the default is to reject. */
17962306a36Sopenharmony_ci}
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci/**
18262306a36Sopenharmony_ci * __post_watch_notification - Post an event notification
18362306a36Sopenharmony_ci * @wlist: The watch list to post the event to.
18462306a36Sopenharmony_ci * @n: The notification record to post.
18562306a36Sopenharmony_ci * @cred: The creds of the process that triggered the notification.
18662306a36Sopenharmony_ci * @id: The ID to match on the watch.
18762306a36Sopenharmony_ci *
18862306a36Sopenharmony_ci * Post a notification of an event into a set of watch queues and let the users
18962306a36Sopenharmony_ci * know.
19062306a36Sopenharmony_ci *
19162306a36Sopenharmony_ci * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and
19262306a36Sopenharmony_ci * should be in units of sizeof(*n).
19362306a36Sopenharmony_ci */
19462306a36Sopenharmony_civoid __post_watch_notification(struct watch_list *wlist,
19562306a36Sopenharmony_ci			       struct watch_notification *n,
19662306a36Sopenharmony_ci			       const struct cred *cred,
19762306a36Sopenharmony_ci			       u64 id)
19862306a36Sopenharmony_ci{
19962306a36Sopenharmony_ci	const struct watch_filter *wf;
20062306a36Sopenharmony_ci	struct watch_queue *wqueue;
20162306a36Sopenharmony_ci	struct watch *watch;
20262306a36Sopenharmony_ci
20362306a36Sopenharmony_ci	if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) {
20462306a36Sopenharmony_ci		WARN_ON(1);
20562306a36Sopenharmony_ci		return;
20662306a36Sopenharmony_ci	}
20762306a36Sopenharmony_ci
20862306a36Sopenharmony_ci	rcu_read_lock();
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) {
21162306a36Sopenharmony_ci		if (watch->id != id)
21262306a36Sopenharmony_ci			continue;
21362306a36Sopenharmony_ci		n->info &= ~WATCH_INFO_ID;
21462306a36Sopenharmony_ci		n->info |= watch->info_id;
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_ci		wqueue = rcu_dereference(watch->queue);
21762306a36Sopenharmony_ci		wf = rcu_dereference(wqueue->filter);
21862306a36Sopenharmony_ci		if (wf && !filter_watch_notification(wf, n))
21962306a36Sopenharmony_ci			continue;
22062306a36Sopenharmony_ci
22162306a36Sopenharmony_ci		if (security_post_notification(watch->cred, cred, n) < 0)
22262306a36Sopenharmony_ci			continue;
22362306a36Sopenharmony_ci
22462306a36Sopenharmony_ci		if (lock_wqueue(wqueue)) {
22562306a36Sopenharmony_ci			post_one_notification(wqueue, n);
22662306a36Sopenharmony_ci			unlock_wqueue(wqueue);
22762306a36Sopenharmony_ci		}
22862306a36Sopenharmony_ci	}
22962306a36Sopenharmony_ci
23062306a36Sopenharmony_ci	rcu_read_unlock();
23162306a36Sopenharmony_ci}
23262306a36Sopenharmony_ciEXPORT_SYMBOL(__post_watch_notification);
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_ci/*
23562306a36Sopenharmony_ci * Allocate sufficient pages to preallocation for the requested number of
23662306a36Sopenharmony_ci * notifications.
23762306a36Sopenharmony_ci */
23862306a36Sopenharmony_cilong watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
23962306a36Sopenharmony_ci{
24062306a36Sopenharmony_ci	struct watch_queue *wqueue = pipe->watch_queue;
24162306a36Sopenharmony_ci	struct page **pages;
24262306a36Sopenharmony_ci	unsigned long *bitmap;
24362306a36Sopenharmony_ci	unsigned long user_bufs;
24462306a36Sopenharmony_ci	int ret, i, nr_pages;
24562306a36Sopenharmony_ci
24662306a36Sopenharmony_ci	if (!wqueue)
24762306a36Sopenharmony_ci		return -ENODEV;
24862306a36Sopenharmony_ci	if (wqueue->notes)
24962306a36Sopenharmony_ci		return -EBUSY;
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci	if (nr_notes < 1 ||
25262306a36Sopenharmony_ci	    nr_notes > 512) /* TODO: choose a better hard limit */
25362306a36Sopenharmony_ci		return -EINVAL;
25462306a36Sopenharmony_ci
25562306a36Sopenharmony_ci	nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
25662306a36Sopenharmony_ci	nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
25762306a36Sopenharmony_ci	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	if (nr_pages > pipe->max_usage &&
26062306a36Sopenharmony_ci	    (too_many_pipe_buffers_hard(user_bufs) ||
26162306a36Sopenharmony_ci	     too_many_pipe_buffers_soft(user_bufs)) &&
26262306a36Sopenharmony_ci	    pipe_is_unprivileged_user()) {
26362306a36Sopenharmony_ci		ret = -EPERM;
26462306a36Sopenharmony_ci		goto error;
26562306a36Sopenharmony_ci	}
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_ci	nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
26862306a36Sopenharmony_ci	ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
26962306a36Sopenharmony_ci	if (ret < 0)
27062306a36Sopenharmony_ci		goto error;
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci	ret = -ENOMEM;
27362306a36Sopenharmony_ci	pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
27462306a36Sopenharmony_ci	if (!pages)
27562306a36Sopenharmony_ci		goto error;
27662306a36Sopenharmony_ci
27762306a36Sopenharmony_ci	for (i = 0; i < nr_pages; i++) {
27862306a36Sopenharmony_ci		pages[i] = alloc_page(GFP_KERNEL);
27962306a36Sopenharmony_ci		if (!pages[i])
28062306a36Sopenharmony_ci			goto error_p;
28162306a36Sopenharmony_ci		pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
28262306a36Sopenharmony_ci	}
28362306a36Sopenharmony_ci
28462306a36Sopenharmony_ci	bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
28562306a36Sopenharmony_ci	if (!bitmap)
28662306a36Sopenharmony_ci		goto error_p;
28762306a36Sopenharmony_ci
28862306a36Sopenharmony_ci	bitmap_fill(bitmap, nr_notes);
28962306a36Sopenharmony_ci	wqueue->notes = pages;
29062306a36Sopenharmony_ci	wqueue->notes_bitmap = bitmap;
29162306a36Sopenharmony_ci	wqueue->nr_pages = nr_pages;
29262306a36Sopenharmony_ci	wqueue->nr_notes = nr_notes;
29362306a36Sopenharmony_ci	return 0;
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_cierror_p:
29662306a36Sopenharmony_ci	while (--i >= 0)
29762306a36Sopenharmony_ci		__free_page(pages[i]);
29862306a36Sopenharmony_ci	kfree(pages);
29962306a36Sopenharmony_cierror:
30062306a36Sopenharmony_ci	(void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
30162306a36Sopenharmony_ci	return ret;
30262306a36Sopenharmony_ci}
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_ci/*
30562306a36Sopenharmony_ci * Set the filter on a watch queue.
30662306a36Sopenharmony_ci */
30762306a36Sopenharmony_cilong watch_queue_set_filter(struct pipe_inode_info *pipe,
30862306a36Sopenharmony_ci			    struct watch_notification_filter __user *_filter)
30962306a36Sopenharmony_ci{
31062306a36Sopenharmony_ci	struct watch_notification_type_filter *tf;
31162306a36Sopenharmony_ci	struct watch_notification_filter filter;
31262306a36Sopenharmony_ci	struct watch_type_filter *q;
31362306a36Sopenharmony_ci	struct watch_filter *wfilter;
31462306a36Sopenharmony_ci	struct watch_queue *wqueue = pipe->watch_queue;
31562306a36Sopenharmony_ci	int ret, nr_filter = 0, i;
31662306a36Sopenharmony_ci
31762306a36Sopenharmony_ci	if (!wqueue)
31862306a36Sopenharmony_ci		return -ENODEV;
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci	if (!_filter) {
32162306a36Sopenharmony_ci		/* Remove the old filter */
32262306a36Sopenharmony_ci		wfilter = NULL;
32362306a36Sopenharmony_ci		goto set;
32462306a36Sopenharmony_ci	}
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_ci	/* Grab the user's filter specification */
32762306a36Sopenharmony_ci	if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
32862306a36Sopenharmony_ci		return -EFAULT;
32962306a36Sopenharmony_ci	if (filter.nr_filters == 0 ||
33062306a36Sopenharmony_ci	    filter.nr_filters > 16 ||
33162306a36Sopenharmony_ci	    filter.__reserved != 0)
33262306a36Sopenharmony_ci		return -EINVAL;
33362306a36Sopenharmony_ci
33462306a36Sopenharmony_ci	tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf));
33562306a36Sopenharmony_ci	if (IS_ERR(tf))
33662306a36Sopenharmony_ci		return PTR_ERR(tf);
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	ret = -EINVAL;
33962306a36Sopenharmony_ci	for (i = 0; i < filter.nr_filters; i++) {
34062306a36Sopenharmony_ci		if ((tf[i].info_filter & ~tf[i].info_mask) ||
34162306a36Sopenharmony_ci		    tf[i].info_mask & WATCH_INFO_LENGTH)
34262306a36Sopenharmony_ci			goto err_filter;
34362306a36Sopenharmony_ci		/* Ignore any unknown types */
34462306a36Sopenharmony_ci		if (tf[i].type >= WATCH_TYPE__NR)
34562306a36Sopenharmony_ci			continue;
34662306a36Sopenharmony_ci		nr_filter++;
34762306a36Sopenharmony_ci	}
34862306a36Sopenharmony_ci
34962306a36Sopenharmony_ci	/* Now we need to build the internal filter from only the relevant
35062306a36Sopenharmony_ci	 * user-specified filters.
35162306a36Sopenharmony_ci	 */
35262306a36Sopenharmony_ci	ret = -ENOMEM;
35362306a36Sopenharmony_ci	wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
35462306a36Sopenharmony_ci	if (!wfilter)
35562306a36Sopenharmony_ci		goto err_filter;
35662306a36Sopenharmony_ci	wfilter->nr_filters = nr_filter;
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ci	q = wfilter->filters;
35962306a36Sopenharmony_ci	for (i = 0; i < filter.nr_filters; i++) {
36062306a36Sopenharmony_ci		if (tf[i].type >= WATCH_TYPE__NR)
36162306a36Sopenharmony_ci			continue;
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ci		q->type			= tf[i].type;
36462306a36Sopenharmony_ci		q->info_filter		= tf[i].info_filter;
36562306a36Sopenharmony_ci		q->info_mask		= tf[i].info_mask;
36662306a36Sopenharmony_ci		q->subtype_filter[0]	= tf[i].subtype_filter[0];
36762306a36Sopenharmony_ci		__set_bit(q->type, wfilter->type_filter);
36862306a36Sopenharmony_ci		q++;
36962306a36Sopenharmony_ci	}
37062306a36Sopenharmony_ci
37162306a36Sopenharmony_ci	kfree(tf);
37262306a36Sopenharmony_ciset:
37362306a36Sopenharmony_ci	pipe_lock(pipe);
37462306a36Sopenharmony_ci	wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
37562306a36Sopenharmony_ci				      lockdep_is_held(&pipe->mutex));
37662306a36Sopenharmony_ci	pipe_unlock(pipe);
37762306a36Sopenharmony_ci	if (wfilter)
37862306a36Sopenharmony_ci		kfree_rcu(wfilter, rcu);
37962306a36Sopenharmony_ci	return 0;
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_cierr_filter:
38262306a36Sopenharmony_ci	kfree(tf);
38362306a36Sopenharmony_ci	return ret;
38462306a36Sopenharmony_ci}
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_cistatic void __put_watch_queue(struct kref *kref)
38762306a36Sopenharmony_ci{
38862306a36Sopenharmony_ci	struct watch_queue *wqueue =
38962306a36Sopenharmony_ci		container_of(kref, struct watch_queue, usage);
39062306a36Sopenharmony_ci	struct watch_filter *wfilter;
39162306a36Sopenharmony_ci	int i;
39262306a36Sopenharmony_ci
39362306a36Sopenharmony_ci	for (i = 0; i < wqueue->nr_pages; i++)
39462306a36Sopenharmony_ci		__free_page(wqueue->notes[i]);
39562306a36Sopenharmony_ci	kfree(wqueue->notes);
39662306a36Sopenharmony_ci	bitmap_free(wqueue->notes_bitmap);
39762306a36Sopenharmony_ci
39862306a36Sopenharmony_ci	wfilter = rcu_access_pointer(wqueue->filter);
39962306a36Sopenharmony_ci	if (wfilter)
40062306a36Sopenharmony_ci		kfree_rcu(wfilter, rcu);
40162306a36Sopenharmony_ci	kfree_rcu(wqueue, rcu);
40262306a36Sopenharmony_ci}
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_ci/**
40562306a36Sopenharmony_ci * put_watch_queue - Dispose of a ref on a watchqueue.
40662306a36Sopenharmony_ci * @wqueue: The watch queue to unref.
40762306a36Sopenharmony_ci */
40862306a36Sopenharmony_civoid put_watch_queue(struct watch_queue *wqueue)
40962306a36Sopenharmony_ci{
41062306a36Sopenharmony_ci	kref_put(&wqueue->usage, __put_watch_queue);
41162306a36Sopenharmony_ci}
41262306a36Sopenharmony_ciEXPORT_SYMBOL(put_watch_queue);
41362306a36Sopenharmony_ci
41462306a36Sopenharmony_cistatic void free_watch(struct rcu_head *rcu)
41562306a36Sopenharmony_ci{
41662306a36Sopenharmony_ci	struct watch *watch = container_of(rcu, struct watch, rcu);
41762306a36Sopenharmony_ci
41862306a36Sopenharmony_ci	put_watch_queue(rcu_access_pointer(watch->queue));
41962306a36Sopenharmony_ci	atomic_dec(&watch->cred->user->nr_watches);
42062306a36Sopenharmony_ci	put_cred(watch->cred);
42162306a36Sopenharmony_ci	kfree(watch);
42262306a36Sopenharmony_ci}
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_cistatic void __put_watch(struct kref *kref)
42562306a36Sopenharmony_ci{
42662306a36Sopenharmony_ci	struct watch *watch = container_of(kref, struct watch, usage);
42762306a36Sopenharmony_ci
42862306a36Sopenharmony_ci	call_rcu(&watch->rcu, free_watch);
42962306a36Sopenharmony_ci}
43062306a36Sopenharmony_ci
43162306a36Sopenharmony_ci/*
43262306a36Sopenharmony_ci * Discard a watch.
43362306a36Sopenharmony_ci */
43462306a36Sopenharmony_cistatic void put_watch(struct watch *watch)
43562306a36Sopenharmony_ci{
43662306a36Sopenharmony_ci	kref_put(&watch->usage, __put_watch);
43762306a36Sopenharmony_ci}
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci/**
44062306a36Sopenharmony_ci * init_watch - Initialise a watch
44162306a36Sopenharmony_ci * @watch: The watch to initialise.
44262306a36Sopenharmony_ci * @wqueue: The queue to assign.
44362306a36Sopenharmony_ci *
44462306a36Sopenharmony_ci * Initialise a watch and set the watch queue.
44562306a36Sopenharmony_ci */
44662306a36Sopenharmony_civoid init_watch(struct watch *watch, struct watch_queue *wqueue)
44762306a36Sopenharmony_ci{
44862306a36Sopenharmony_ci	kref_init(&watch->usage);
44962306a36Sopenharmony_ci	INIT_HLIST_NODE(&watch->list_node);
45062306a36Sopenharmony_ci	INIT_HLIST_NODE(&watch->queue_node);
45162306a36Sopenharmony_ci	rcu_assign_pointer(watch->queue, wqueue);
45262306a36Sopenharmony_ci}
45362306a36Sopenharmony_ci
45462306a36Sopenharmony_cistatic int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue)
45562306a36Sopenharmony_ci{
45662306a36Sopenharmony_ci	const struct cred *cred;
45762306a36Sopenharmony_ci	struct watch *w;
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ci	hlist_for_each_entry(w, &wlist->watchers, list_node) {
46062306a36Sopenharmony_ci		struct watch_queue *wq = rcu_access_pointer(w->queue);
46162306a36Sopenharmony_ci		if (wqueue == wq && watch->id == w->id)
46262306a36Sopenharmony_ci			return -EBUSY;
46362306a36Sopenharmony_ci	}
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_ci	cred = current_cred();
46662306a36Sopenharmony_ci	if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) {
46762306a36Sopenharmony_ci		atomic_dec(&cred->user->nr_watches);
46862306a36Sopenharmony_ci		return -EAGAIN;
46962306a36Sopenharmony_ci	}
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci	watch->cred = get_cred(cred);
47262306a36Sopenharmony_ci	rcu_assign_pointer(watch->watch_list, wlist);
47362306a36Sopenharmony_ci
47462306a36Sopenharmony_ci	kref_get(&wqueue->usage);
47562306a36Sopenharmony_ci	kref_get(&watch->usage);
47662306a36Sopenharmony_ci	hlist_add_head(&watch->queue_node, &wqueue->watches);
47762306a36Sopenharmony_ci	hlist_add_head_rcu(&watch->list_node, &wlist->watchers);
47862306a36Sopenharmony_ci	return 0;
47962306a36Sopenharmony_ci}
48062306a36Sopenharmony_ci
48162306a36Sopenharmony_ci/**
48262306a36Sopenharmony_ci * add_watch_to_object - Add a watch on an object to a watch list
48362306a36Sopenharmony_ci * @watch: The watch to add
48462306a36Sopenharmony_ci * @wlist: The watch list to add to
48562306a36Sopenharmony_ci *
48662306a36Sopenharmony_ci * @watch->queue must have been set to point to the queue to post notifications
48762306a36Sopenharmony_ci * to and the watch list of the object to be watched.  @watch->cred must also
48862306a36Sopenharmony_ci * have been set to the appropriate credentials and a ref taken on them.
48962306a36Sopenharmony_ci *
49062306a36Sopenharmony_ci * The caller must pin the queue and the list both and must hold the list
49162306a36Sopenharmony_ci * locked against racing watch additions/removals.
49262306a36Sopenharmony_ci */
49362306a36Sopenharmony_ciint add_watch_to_object(struct watch *watch, struct watch_list *wlist)
49462306a36Sopenharmony_ci{
49562306a36Sopenharmony_ci	struct watch_queue *wqueue;
49662306a36Sopenharmony_ci	int ret = -ENOENT;
49762306a36Sopenharmony_ci
49862306a36Sopenharmony_ci	rcu_read_lock();
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	wqueue = rcu_access_pointer(watch->queue);
50162306a36Sopenharmony_ci	if (lock_wqueue(wqueue)) {
50262306a36Sopenharmony_ci		spin_lock(&wlist->lock);
50362306a36Sopenharmony_ci		ret = add_one_watch(watch, wlist, wqueue);
50462306a36Sopenharmony_ci		spin_unlock(&wlist->lock);
50562306a36Sopenharmony_ci		unlock_wqueue(wqueue);
50662306a36Sopenharmony_ci	}
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci	rcu_read_unlock();
50962306a36Sopenharmony_ci	return ret;
51062306a36Sopenharmony_ci}
51162306a36Sopenharmony_ciEXPORT_SYMBOL(add_watch_to_object);
51262306a36Sopenharmony_ci
51362306a36Sopenharmony_ci/**
51462306a36Sopenharmony_ci * remove_watch_from_object - Remove a watch or all watches from an object.
51562306a36Sopenharmony_ci * @wlist: The watch list to remove from
51662306a36Sopenharmony_ci * @wq: The watch queue of interest (ignored if @all is true)
51762306a36Sopenharmony_ci * @id: The ID of the watch to remove (ignored if @all is true)
51862306a36Sopenharmony_ci * @all: True to remove all objects
51962306a36Sopenharmony_ci *
52062306a36Sopenharmony_ci * Remove a specific watch or all watches from an object.  A notification is
52162306a36Sopenharmony_ci * sent to the watcher to tell them that this happened.
52262306a36Sopenharmony_ci */
52362306a36Sopenharmony_ciint remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
52462306a36Sopenharmony_ci			     u64 id, bool all)
52562306a36Sopenharmony_ci{
52662306a36Sopenharmony_ci	struct watch_notification_removal n;
52762306a36Sopenharmony_ci	struct watch_queue *wqueue;
52862306a36Sopenharmony_ci	struct watch *watch;
52962306a36Sopenharmony_ci	int ret = -EBADSLT;
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_ci	rcu_read_lock();
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_ciagain:
53462306a36Sopenharmony_ci	spin_lock(&wlist->lock);
53562306a36Sopenharmony_ci	hlist_for_each_entry(watch, &wlist->watchers, list_node) {
53662306a36Sopenharmony_ci		if (all ||
53762306a36Sopenharmony_ci		    (watch->id == id && rcu_access_pointer(watch->queue) == wq))
53862306a36Sopenharmony_ci			goto found;
53962306a36Sopenharmony_ci	}
54062306a36Sopenharmony_ci	spin_unlock(&wlist->lock);
54162306a36Sopenharmony_ci	goto out;
54262306a36Sopenharmony_ci
54362306a36Sopenharmony_cifound:
54462306a36Sopenharmony_ci	ret = 0;
54562306a36Sopenharmony_ci	hlist_del_init_rcu(&watch->list_node);
54662306a36Sopenharmony_ci	rcu_assign_pointer(watch->watch_list, NULL);
54762306a36Sopenharmony_ci	spin_unlock(&wlist->lock);
54862306a36Sopenharmony_ci
54962306a36Sopenharmony_ci	/* We now own the reference on watch that used to belong to wlist. */
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ci	n.watch.type = WATCH_TYPE_META;
55262306a36Sopenharmony_ci	n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION;
55362306a36Sopenharmony_ci	n.watch.info = watch->info_id | watch_sizeof(n.watch);
55462306a36Sopenharmony_ci	n.id = id;
55562306a36Sopenharmony_ci	if (id != 0)
55662306a36Sopenharmony_ci		n.watch.info = watch->info_id | watch_sizeof(n);
55762306a36Sopenharmony_ci
55862306a36Sopenharmony_ci	wqueue = rcu_dereference(watch->queue);
55962306a36Sopenharmony_ci
56062306a36Sopenharmony_ci	if (lock_wqueue(wqueue)) {
56162306a36Sopenharmony_ci		post_one_notification(wqueue, &n.watch);
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_ci		if (!hlist_unhashed(&watch->queue_node)) {
56462306a36Sopenharmony_ci			hlist_del_init_rcu(&watch->queue_node);
56562306a36Sopenharmony_ci			put_watch(watch);
56662306a36Sopenharmony_ci		}
56762306a36Sopenharmony_ci
56862306a36Sopenharmony_ci		unlock_wqueue(wqueue);
56962306a36Sopenharmony_ci	}
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_ci	if (wlist->release_watch) {
57262306a36Sopenharmony_ci		void (*release_watch)(struct watch *);
57362306a36Sopenharmony_ci
57462306a36Sopenharmony_ci		release_watch = wlist->release_watch;
57562306a36Sopenharmony_ci		rcu_read_unlock();
57662306a36Sopenharmony_ci		(*release_watch)(watch);
57762306a36Sopenharmony_ci		rcu_read_lock();
57862306a36Sopenharmony_ci	}
57962306a36Sopenharmony_ci	put_watch(watch);
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	if (all && !hlist_empty(&wlist->watchers))
58262306a36Sopenharmony_ci		goto again;
58362306a36Sopenharmony_ciout:
58462306a36Sopenharmony_ci	rcu_read_unlock();
58562306a36Sopenharmony_ci	return ret;
58662306a36Sopenharmony_ci}
58762306a36Sopenharmony_ciEXPORT_SYMBOL(remove_watch_from_object);
58862306a36Sopenharmony_ci
58962306a36Sopenharmony_ci/*
59062306a36Sopenharmony_ci * Remove all the watches that are contributory to a queue.  This has the
59162306a36Sopenharmony_ci * potential to race with removal of the watches by the destruction of the
59262306a36Sopenharmony_ci * objects being watched or with the distribution of notifications.
59362306a36Sopenharmony_ci */
59462306a36Sopenharmony_civoid watch_queue_clear(struct watch_queue *wqueue)
59562306a36Sopenharmony_ci{
59662306a36Sopenharmony_ci	struct watch_list *wlist;
59762306a36Sopenharmony_ci	struct watch *watch;
59862306a36Sopenharmony_ci	bool release;
59962306a36Sopenharmony_ci
60062306a36Sopenharmony_ci	rcu_read_lock();
60162306a36Sopenharmony_ci	spin_lock_bh(&wqueue->lock);
60262306a36Sopenharmony_ci
60362306a36Sopenharmony_ci	/*
60462306a36Sopenharmony_ci	 * This pipe can be freed by callers like free_pipe_info().
60562306a36Sopenharmony_ci	 * Removing this reference also prevents new notifications.
60662306a36Sopenharmony_ci	 */
60762306a36Sopenharmony_ci	wqueue->pipe = NULL;
60862306a36Sopenharmony_ci
60962306a36Sopenharmony_ci	while (!hlist_empty(&wqueue->watches)) {
61062306a36Sopenharmony_ci		watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
61162306a36Sopenharmony_ci		hlist_del_init_rcu(&watch->queue_node);
61262306a36Sopenharmony_ci		/* We now own a ref on the watch. */
61362306a36Sopenharmony_ci		spin_unlock_bh(&wqueue->lock);
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci		/* We can't do the next bit under the queue lock as we need to
61662306a36Sopenharmony_ci		 * get the list lock - which would cause a deadlock if someone
61762306a36Sopenharmony_ci		 * was removing from the opposite direction at the same time or
61862306a36Sopenharmony_ci		 * posting a notification.
61962306a36Sopenharmony_ci		 */
62062306a36Sopenharmony_ci		wlist = rcu_dereference(watch->watch_list);
62162306a36Sopenharmony_ci		if (wlist) {
62262306a36Sopenharmony_ci			void (*release_watch)(struct watch *);
62362306a36Sopenharmony_ci
62462306a36Sopenharmony_ci			spin_lock(&wlist->lock);
62562306a36Sopenharmony_ci
62662306a36Sopenharmony_ci			release = !hlist_unhashed(&watch->list_node);
62762306a36Sopenharmony_ci			if (release) {
62862306a36Sopenharmony_ci				hlist_del_init_rcu(&watch->list_node);
62962306a36Sopenharmony_ci				rcu_assign_pointer(watch->watch_list, NULL);
63062306a36Sopenharmony_ci
63162306a36Sopenharmony_ci				/* We now own a second ref on the watch. */
63262306a36Sopenharmony_ci			}
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci			release_watch = wlist->release_watch;
63562306a36Sopenharmony_ci			spin_unlock(&wlist->lock);
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci			if (release) {
63862306a36Sopenharmony_ci				if (release_watch) {
63962306a36Sopenharmony_ci					rcu_read_unlock();
64062306a36Sopenharmony_ci					/* This might need to call dput(), so
64162306a36Sopenharmony_ci					 * we have to drop all the locks.
64262306a36Sopenharmony_ci					 */
64362306a36Sopenharmony_ci					(*release_watch)(watch);
64462306a36Sopenharmony_ci					rcu_read_lock();
64562306a36Sopenharmony_ci				}
64662306a36Sopenharmony_ci				put_watch(watch);
64762306a36Sopenharmony_ci			}
64862306a36Sopenharmony_ci		}
64962306a36Sopenharmony_ci
65062306a36Sopenharmony_ci		put_watch(watch);
65162306a36Sopenharmony_ci		spin_lock_bh(&wqueue->lock);
65262306a36Sopenharmony_ci	}
65362306a36Sopenharmony_ci
65462306a36Sopenharmony_ci	spin_unlock_bh(&wqueue->lock);
65562306a36Sopenharmony_ci	rcu_read_unlock();
65662306a36Sopenharmony_ci}
65762306a36Sopenharmony_ci
65862306a36Sopenharmony_ci/**
65962306a36Sopenharmony_ci * get_watch_queue - Get a watch queue from its file descriptor.
66062306a36Sopenharmony_ci * @fd: The fd to query.
66162306a36Sopenharmony_ci */
66262306a36Sopenharmony_cistruct watch_queue *get_watch_queue(int fd)
66362306a36Sopenharmony_ci{
66462306a36Sopenharmony_ci	struct pipe_inode_info *pipe;
66562306a36Sopenharmony_ci	struct watch_queue *wqueue = ERR_PTR(-EINVAL);
66662306a36Sopenharmony_ci	struct fd f;
66762306a36Sopenharmony_ci
66862306a36Sopenharmony_ci	f = fdget(fd);
66962306a36Sopenharmony_ci	if (f.file) {
67062306a36Sopenharmony_ci		pipe = get_pipe_info(f.file, false);
67162306a36Sopenharmony_ci		if (pipe && pipe->watch_queue) {
67262306a36Sopenharmony_ci			wqueue = pipe->watch_queue;
67362306a36Sopenharmony_ci			kref_get(&wqueue->usage);
67462306a36Sopenharmony_ci		}
67562306a36Sopenharmony_ci		fdput(f);
67662306a36Sopenharmony_ci	}
67762306a36Sopenharmony_ci
67862306a36Sopenharmony_ci	return wqueue;
67962306a36Sopenharmony_ci}
68062306a36Sopenharmony_ciEXPORT_SYMBOL(get_watch_queue);
68162306a36Sopenharmony_ci
68262306a36Sopenharmony_ci/*
68362306a36Sopenharmony_ci * Initialise a watch queue
68462306a36Sopenharmony_ci */
68562306a36Sopenharmony_ciint watch_queue_init(struct pipe_inode_info *pipe)
68662306a36Sopenharmony_ci{
68762306a36Sopenharmony_ci	struct watch_queue *wqueue;
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci	wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL);
69062306a36Sopenharmony_ci	if (!wqueue)
69162306a36Sopenharmony_ci		return -ENOMEM;
69262306a36Sopenharmony_ci
69362306a36Sopenharmony_ci	wqueue->pipe = pipe;
69462306a36Sopenharmony_ci	kref_init(&wqueue->usage);
69562306a36Sopenharmony_ci	spin_lock_init(&wqueue->lock);
69662306a36Sopenharmony_ci	INIT_HLIST_HEAD(&wqueue->watches);
69762306a36Sopenharmony_ci
69862306a36Sopenharmony_ci	pipe->watch_queue = wqueue;
69962306a36Sopenharmony_ci	return 0;
70062306a36Sopenharmony_ci}
701