/*
 *	An async IO implementation for Linux
 *	Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *	Implements an efficient asynchronous io interface.
 *
 *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
 *	Copyright 2018 Christoph Hellwig.
 *
 *	See ../COPYING for licensing terms.
 */
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/refcount.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>
#include <linux/nospec.h>

#include "internal.h"

#define KIOCB_KEY		0

#define AIO_RING_MAGIC			0xa10a10a1
#define AIO_RING_COMPAT_FEATURES	1
#define AIO_RING_INCOMPAT_FEATURES	0
struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;	/* Written to by userland or under ring_lock
				 * mutex by aio_read_events_ring(). */
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */

	struct io_event		io_events[];
}; /* 128 bytes + ring size */

/*
 * Plugging is meant to work with larger batches of IOs. If we don't
 * have more than the below, then don't bother setting up a plug.
 */
#define AIO_PLUG_THRESHOLD	2

#define AIO_RING_PAGES	8

struct kioctx_table {
	struct rcu_head		rcu;
	unsigned		nr;
	struct kioctx __rcu	*table[] __counted_by(nr);
};

struct kioctx_cpu {
	unsigned		reqs_available;
};

struct ctx_rq_wait {
	struct completion comp;
	atomic_t count;
};

struct kioctx {
	struct percpu_ref	users;
	atomic_t		dead;

	struct percpu_ref	reqs;

	unsigned long		user_id;

	struct __percpu kioctx_cpu *cpu;

	/*
	 * For percpu reqs_available, number of slots we move to/from global
	 * counter at a time:
	 */
	unsigned		req_batch;
	/*
	 * This is what userspace passed to io_setup(), it's not used for
	 * anything but counting against the global max_reqs quota.
	 *
	 * The real limit is nr_events - 1, which will be larger (see
	 * aio_setup_ring())
	 */
	unsigned		max_reqs;

	/* Size of ringbuffer, in units of struct io_event */
	unsigned		nr_events;

	unsigned long		mmap_base;
	unsigned long		mmap_size;

	struct page		**ring_pages;
	long			nr_pages;

	struct rcu_work		free_rwork;	/* see free_ioctx() */

	/*
	 * signals when all in-flight requests are done
	 */
	struct ctx_rq_wait	*rq_wait;

	struct {
		/*
		 * This counts the number of available slots in the ringbuffer,
		 * so we avoid overflowing it: it's decremented (if positive)
		 * when allocating a kiocb and incremented when the resulting
		 * io_event is pulled off the ringbuffer.
		 *
		 * We batch accesses to it with a percpu version.
		 */
		atomic_t	reqs_available;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t	ctx_lock;
		struct list_head active_reqs;	/* used for cancellation */
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex	ring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned	tail;
		unsigned	completed_events;
		spinlock_t	completion_lock;
	} ____cacheline_aligned_in_smp;

	struct page		*internal_pages[AIO_RING_PAGES];
	struct file		*aio_ring_file;

	unsigned		id;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct fsync_iocb {
	struct file		*file;
	struct work_struct	work;
	bool			datasync;
	struct cred		*creds;
};

struct poll_iocb {
	struct file		*file;
	struct wait_queue_head	*head;
	__poll_t		events;
	bool			cancelled;
	bool			work_scheduled;
	bool			work_need_resched;
	struct wait_queue_entry	wait;
	struct work_struct	work;
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct aio_kiocb {
	union {
		struct file		*ki_filp;
		struct kiocb		rw;
		struct fsync_iocb	fsync;
		struct poll_iocb	poll;
	};

	struct kioctx		*ki_ctx;
	kiocb_cancel_fn		*ki_cancel;

	struct io_event		ki_res;

	struct list_head	ki_list;	/* the aio core uses this
						 * for cancellation */
	refcount_t		ki_refcnt;

	/*
	 * If the aio_resfd field of the userspace iocb is not zero,
	 * this is the underlying eventfd context to deliver events to.
	 */
	struct eventfd_ctx	*ki_eventfd;
};

/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
static unsigned long aio_nr;		/* current system wide number of aio requests */
static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
#ifdef CONFIG_SYSCTL
static struct ctl_table aio_sysctls[] = {
	{
		.procname	= "aio-nr",
		.data		= &aio_nr,
		.maxlen		= sizeof(aio_nr),
		.mode		= 0444,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "aio-max-nr",
		.data		= &aio_max_nr,
		.maxlen		= sizeof(aio_max_nr),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{}
};

static void __init aio_sysctl_init(void)
{
	register_sysctl_init("fs", aio_sysctls);
}
#else
#define aio_sysctl_init() do { } while (0)
#endif

static struct kmem_cache	*kiocb_cachep;
static struct kmem_cache	*kioctx_cachep;

static struct vfsmount *aio_mnt;

static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;

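/*
 * Back the ring with an anonymous inode on the aio pseudo filesystem.
 * Routing the ring pages through a real address_space (aio_ctx_aops)
 * is what allows page migration to find and move them later.
 */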
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
	struct file *file;
	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	inode->i_mapping->a_ops = &aio_ctx_aops;
	inode->i_mapping->private_data = ctx;
	inode->i_size = PAGE_SIZE * nr_pages;

	file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
				O_RDWR, &aio_ring_fops);
	if (IS_ERR(file))
		iput(inode);
	return file;
}

static int aio_init_fs_context(struct fs_context *fc)
{
	if (!init_pseudo(fc, AIO_RING_MAGIC))
		return -ENOMEM;
	fc->s_iflags |= SB_I_NOEXEC;
	return 0;
}

/* aio_setup
 *	Creates the slab caches used by the aio routines, panicking on
 *	failure as this is done early during the boot sequence.
 */
static int __init aio_setup(void)
{
	static struct file_system_type aio_fs = {
		.name		= "aio",
		.init_fs_context = aio_init_fs_context,
		.kill_sb	= kill_anon_super,
	};
	aio_mnt = kern_mount(&aio_fs);
	if (IS_ERR(aio_mnt))
		panic("Failed to create aio fs mount.");

	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	aio_sysctl_init();
	return 0;
}
__initcall(aio_setup);

static void put_aio_ring_file(struct kioctx *ctx)
{
	struct file *aio_ring_file = ctx->aio_ring_file;
	struct address_space *i_mapping;

	if (aio_ring_file) {
		truncate_setsize(file_inode(aio_ring_file), 0);

		/* Prevent further access to the kioctx from migratepages */
		i_mapping = aio_ring_file->f_mapping;
		spin_lock(&i_mapping->private_lock);
		i_mapping->private_data = NULL;
		ctx->aio_ring_file = NULL;
		spin_unlock(&i_mapping->private_lock);

		fput(aio_ring_file);
	}
}

static void aio_free_ring(struct kioctx *ctx)
{
	int i;

	/* Disconnect the kioctx from the ring file.  This prevents future
	 * accesses to the kioctx from page migration.
	 */
	put_aio_ring_file(ctx);

	for (i = 0; i < ctx->nr_pages; i++) {
		struct page *page;
		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
				page_count(ctx->ring_pages[i]));
		page = ctx->ring_pages[i];
		if (!page)
			continue;
		ctx->ring_pages[i] = NULL;
		put_page(page);
	}

	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
		kfree(ctx->ring_pages);
		ctx->ring_pages = NULL;
	}
}

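/*
 * Called when userspace mremap()s the ring mapping: find the kioctx
 * that owns this ring file and update its user_id/mmap_base to the
 * new address, so that lookup_ioctx() keeps working.
 */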
static int aio_ring_mremap(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct kioctx_table *table;
	int i, res = -EINVAL;

	spin_lock(&mm->ioctx_lock);
	rcu_read_lock();
	table = rcu_dereference(mm->ioctx_table);
	if (!table)
		goto out_unlock;

	for (i = 0; i < table->nr; i++) {
		struct kioctx *ctx;

		ctx = rcu_dereference(table->table[i]);
		if (ctx && ctx->aio_ring_file == file) {
			if (!atomic_read(&ctx->dead)) {
				ctx->user_id = ctx->mmap_base = vma->vm_start;
				res = 0;
			}
			break;
		}
	}

out_unlock:
	rcu_read_unlock();
	spin_unlock(&mm->ioctx_lock);
	return res;
}

static const struct vm_operations_struct aio_ring_vm_ops = {
	.mremap		= aio_ring_mremap,
#if IS_ENABLED(CONFIG_MMU)
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
#endif
};

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_DONTEXPAND);
	vma->vm_ops = &aio_ring_vm_ops;
	return 0;
}

static const struct file_operations aio_ring_fops = {
	.mmap = aio_ring_mmap,
};

#if IS_ENABLED(CONFIG_MIGRATION)
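/*
 * Migrate one ring page: pin the kioctx via mapping->private_lock,
 * trylock ring_lock to fence off readers, move the mapping, then copy
 * the contents and swap ring_pages[] under completion_lock so that no
 * completion event can be lost mid-copy.
 */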
static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
			struct folio *src, enum migrate_mode mode)
{
	struct kioctx *ctx;
	unsigned long flags;
	pgoff_t idx;
	int rc;

	/*
	 * We cannot support the _NO_COPY case here, because the copy needs
	 * to happen under the ctx->completion_lock. That does not work with
	 * the migration workflow of MIGRATE_SYNC_NO_COPY.
	 */
	if (mode == MIGRATE_SYNC_NO_COPY)
		return -EINVAL;

	rc = 0;

	/* mapping->private_lock here protects against the kioctx teardown.  */
	spin_lock(&mapping->private_lock);
	ctx = mapping->private_data;
	if (!ctx) {
		rc = -EINVAL;
		goto out;
	}

	/* The ring_lock mutex.  This prevents aio_read_events() from writing
	 * to the ring's head, and prevents page migration from mucking in
	 * a partially initialized kioctx.
	 */
	if (!mutex_trylock(&ctx->ring_lock)) {
		rc = -EAGAIN;
		goto out;
	}

	idx = src->index;
	if (idx < (pgoff_t)ctx->nr_pages) {
		/* Make sure the old folio hasn't already been changed */
		if (ctx->ring_pages[idx] != &src->page)
			rc = -EAGAIN;
	} else
		rc = -EINVAL;

	if (rc != 0)
		goto out_unlock;

	/* Writeback must be complete */
	BUG_ON(folio_test_writeback(src));
	folio_get(dst);

	rc = folio_migrate_mapping(mapping, dst, src, 1);
	if (rc != MIGRATEPAGE_SUCCESS) {
		folio_put(dst);
		goto out_unlock;
	}

	/* Take completion_lock to prevent other writes to the ring buffer
	 * while the old folio is copied to the new.  This prevents new
	 * events from being lost.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);
	folio_migrate_copy(dst, src);
	BUG_ON(ctx->ring_pages[idx] != &src->page);
	ctx->ring_pages[idx] = &dst->page;
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	/* The old folio is no longer accessible. */
	folio_put(src);

out_unlock:
	mutex_unlock(&ctx->ring_lock);
out:
	spin_unlock(&mapping->private_lock);
	return rc;
}
#else
#define aio_migrate_folio NULL
#endif

static const struct address_space_operations aio_ctx_aops = {
	.dirty_folio	= noop_dirty_folio,
	.migrate_folio	= aio_migrate_folio,
};

static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
{
	struct aio_ring *ring;
	struct mm_struct *mm = current->mm;
	unsigned long size, unused;
	int nr_pages;
	int i;
	struct file *file;

	/* Compensate for the ring buffer's head/tail overlap entry */
	nr_events += 2;	/* 1 is required, 2 for good luck */

	size = sizeof(struct aio_ring);
	size += sizeof(struct io_event) * nr_events;

	nr_pages = PFN_UP(size);
	if (nr_pages < 0)
		return -EINVAL;

	file = aio_private_file(ctx, nr_pages);
	if (IS_ERR(file)) {
		ctx->aio_ring_file = NULL;
		return -ENOMEM;
	}

	ctx->aio_ring_file = file;
	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
			/ sizeof(struct io_event);

	ctx->ring_pages = ctx->internal_pages;
	if (nr_pages > AIO_RING_PAGES) {
		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
					  GFP_KERNEL);
		if (!ctx->ring_pages) {
			put_aio_ring_file(ctx);
			return -ENOMEM;
		}
	}

	for (i = 0; i < nr_pages; i++) {
		struct page *page;
		page = find_or_create_page(file->f_mapping,
					   i, GFP_USER | __GFP_ZERO);
		if (!page)
			break;
		pr_debug("pid(%d) page[%d]->count=%d\n",
			 current->pid, i, page_count(page));
		SetPageUptodate(page);
		unlock_page(page);

		ctx->ring_pages[i] = page;
	}
	ctx->nr_pages = i;

	if (unlikely(i != nr_pages)) {
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	ctx->mmap_size = nr_pages * PAGE_SIZE;
	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);

	if (mmap_write_lock_killable(mm)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -EINTR;
	}

	ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
				 PROT_READ | PROT_WRITE,
				 MAP_SHARED, 0, 0, &unused, NULL);
	mmap_write_unlock(mm);
	if (IS_ERR((void *)ctx->mmap_base)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);

	ctx->user_id = ctx->mmap_base;
	ctx->nr_events = nr_events; /* trusted copy */

	ring = page_address(ctx->ring_pages[0]);
	ring->nr = nr_events;	/* user copy */
	ring->id = ~0U;
	ring->head = ring->tail = 0;
	ring->magic = AIO_RING_MAGIC;
	ring->compat_features = AIO_RING_COMPAT_FEATURES;
	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
	ring->header_length = sizeof(struct aio_ring);
	flush_dcache_page(ctx->ring_pages[0]);

	return 0;
}

#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
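/*
 * Illustrative geometry (assuming 4 KiB pages, 32-byte io_events and
 * the eight-unsigned-field aio_ring header above): the first ring page
 * holds AIO_EVENTS_FIRST_PAGE = (4096 - 32) / 32 = 127 events, every
 * further page holds AIO_EVENTS_PER_PAGE = 128, and AIO_EVENTS_OFFSET
 * = 1 is the bias that maps an event index to its page/slot pair (see
 * aio_complete() below).
 */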
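/*
 * Register a cancellation callback for an aio read/write kiocb and
 * queue the request on its context's active_reqs list, so that
 * free_ioctx_users() can cancel it if the context is torn down.
 */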
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
	struct kioctx *ctx = req->ki_ctx;
	unsigned long flags;

	/*
	 * kiocb didn't come from aio or is neither a read nor a write, hence
	 * ignore it.
	 */
	if (!(iocb->ki_flags & IOCB_AIO_RW))
		return;

	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
		return;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	list_add_tail(&req->ki_list, &ctx->active_reqs);
	req->ki_cancel = cancel;
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);

/*
 * free_ioctx() should be RCU delayed to synchronize against the RCU
 * protected lookup_ioctx() and also needs process context to call
 * aio_free_ring().  Use rcu_work.
 */
static void free_ioctx(struct work_struct *work)
{
	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
					  free_rwork);
	pr_debug("freeing %p\n", ctx);

	aio_free_ring(ctx);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
}

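/*
 * Runs when the last ctx->reqs reference is dropped, i.e. no request
 * can still be in flight: wake up kill_ioctx()/exit_aio() waiters and
 * punt the actual freeing to RCU-delayed work.
 */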
static void free_ioctx_reqs(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

	/* At this point we know that there are no in-flight requests */
	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
		complete(&ctx->rq_wait->comp);

	/* Synchronize against RCU protected table->table[] dereferences */
	INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
	queue_rcu_work(system_wq, &ctx->free_rwork);
}

/*
 * When this function runs, the kioctx has been removed from the "hash table"
 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
 * now it's safe to cancel any that need to be.
 */
static void free_ioctx_users(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, users);
	struct aio_kiocb *req;

	spin_lock_irq(&ctx->ctx_lock);

	while (!list_empty(&ctx->active_reqs)) {
		req = list_first_entry(&ctx->active_reqs,
				       struct aio_kiocb, ki_list);
		req->ki_cancel(&req->rw);
		list_del_init(&req->ki_list);
	}

	spin_unlock_irq(&ctx->ctx_lock);

	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
}

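/*
 * Publish the new kioctx in mm->ioctx_table: reuse a free slot when
 * one exists, otherwise grow the table fourfold and retry.  The slot
 * index is mirrored into the ring header as ring->id so that
 * lookup_ioctx() can find the context again.
 */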
static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
	unsigned i, new_nr;
	struct kioctx_table *table, *old;
	struct aio_ring *ring;

	spin_lock(&mm->ioctx_lock);
	table = rcu_dereference_raw(mm->ioctx_table);

	while (1) {
		if (table)
			for (i = 0; i < table->nr; i++)
				if (!rcu_access_pointer(table->table[i])) {
					ctx->id = i;
					rcu_assign_pointer(table->table[i], ctx);
					spin_unlock(&mm->ioctx_lock);

					/* While kioctx setup is in progress,
					 * we are protected from page migration
					 * changing ring_pages by ->ring_lock.
					 */
					ring = page_address(ctx->ring_pages[0]);
					ring->id = ctx->id;
					return 0;
				}

		new_nr = (table ? table->nr : 1) * 4;
		spin_unlock(&mm->ioctx_lock);

		table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		table->nr = new_nr;

		spin_lock(&mm->ioctx_lock);
		old = rcu_dereference_raw(mm->ioctx_table);

		if (!old) {
			rcu_assign_pointer(mm->ioctx_table, table);
		} else if (table->nr > old->nr) {
			memcpy(table->table, old->table,
			       old->nr * sizeof(struct kioctx *));

			rcu_assign_pointer(mm->ioctx_table, table);
			kfree_rcu(old, rcu);
		} else {
			kfree(table);
			table = old;
		}
	}
}

static void aio_nr_sub(unsigned nr)
{
	spin_lock(&aio_nr_lock);
	if (WARN_ON(aio_nr - nr > aio_nr))
		aio_nr = 0;
	else
		aio_nr -= nr;
	spin_unlock(&aio_nr_lock);
}

/* ioctx_alloc
 *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
 */
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx;
	int err = -ENOMEM;

	/*
	 * Store the original nr_events -- what userspace passed to io_setup(),
	 * for counting against the global limit -- before it changes.
	 */
	unsigned int max_reqs = nr_events;

	/*
	 * We keep track of the number of available ringbuffer slots, to prevent
	 * overflow (reqs_available), and we also use percpu counters for this.
	 *
	 * So since up to half the slots might be on other cpus' percpu counters
	 * and unavailable, double nr_events so userspace sees what they
	 * expected: additionally, we move req_batch slots to/from percpu
	 * counters at a time, so make sure that isn't 0:
	 */
	nr_events = max(nr_events, num_possible_cpus() * 4);
	nr_events *= 2;
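	/*
	 * Illustrative numbers: a request for 128 events on a machine with
	 * 4 possible CPUs becomes max(128, 16) = 128, doubled to 256 slots
	 * before aio_setup_ring() rounds the ring up to whole pages.
	 */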

	/* Prevent overflows */
	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}

	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
		return ERR_PTR(-EAGAIN);

	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	ctx->max_reqs = max_reqs;

	spin_lock_init(&ctx->ctx_lock);
	spin_lock_init(&ctx->completion_lock);
	mutex_init(&ctx->ring_lock);
	/* Protect against page migration throughout kioctx setup by keeping
	 * the ring_lock mutex held until setup is complete. */
	mutex_lock(&ctx->ring_lock);
	init_waitqueue_head(&ctx->wait);

	INIT_LIST_HEAD(&ctx->active_reqs);

	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
		goto err;

	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
		goto err;

	ctx->cpu = alloc_percpu(struct kioctx_cpu);
	if (!ctx->cpu)
		goto err;

	err = aio_setup_ring(ctx, nr_events);
	if (err < 0)
		goto err;

	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
	if (ctx->req_batch < 1)
		ctx->req_batch = 1;
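	/*
	 * Illustrative numbers: with ctx->nr_events = 161 and 4 possible
	 * CPUs, req_batch = 160 / 16 = 10 slots are moved between the
	 * global and percpu counters at a time.
	 */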

	/* limit the number of system wide aios */
	spin_lock(&aio_nr_lock);
	if (aio_nr + ctx->max_reqs > aio_max_nr ||
	    aio_nr + ctx->max_reqs < aio_nr) {
		spin_unlock(&aio_nr_lock);
		err = -EAGAIN;
		goto err_ctx;
	}
	aio_nr += ctx->max_reqs;
	spin_unlock(&aio_nr_lock);

	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */

	err = ioctx_add_table(ctx, mm);
	if (err)
		goto err_cleanup;

	/* Release the ring_lock mutex now that all setup is complete. */
	mutex_unlock(&ctx->ring_lock);

	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
		 ctx, ctx->user_id, mm, ctx->nr_events);
	return ctx;

err_cleanup:
	aio_nr_sub(ctx->max_reqs);
err_ctx:
	atomic_set(&ctx->dead, 1);
	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);
	aio_free_ring(ctx);
err:
	mutex_unlock(&ctx->ring_lock);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
	pr_debug("error allocating ioctx %d\n", err);
	return ERR_PTR(err);
}

/* kill_ioctx
 *	Cancels all outstanding aio requests on an aio context.  Used
 *	when the processes owning a context have all exited to encourage
 *	the rapid destruction of the kioctx.
 */
static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
		      struct ctx_rq_wait *wait)
{
	struct kioctx_table *table;

	spin_lock(&mm->ioctx_lock);
	if (atomic_xchg(&ctx->dead, 1)) {
		spin_unlock(&mm->ioctx_lock);
		return -EINVAL;
	}

	table = rcu_dereference_raw(mm->ioctx_table);
	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
	RCU_INIT_POINTER(table->table[ctx->id], NULL);
	spin_unlock(&mm->ioctx_lock);

	/* free_ioctx_reqs() will do the necessary RCU synchronization */
	wake_up_all(&ctx->wait);

	/*
	 * It'd be more correct to do this in free_ioctx(), after all
	 * the outstanding kiocbs have finished - but by then io_destroy
	 * has already returned, so io_setup() could potentially return
	 * -EAGAIN with no ioctxs actually in use (as far as userspace
	 *  could tell).
	 */
	aio_nr_sub(ctx->max_reqs);

	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);

	ctx->rq_wait = wait;
	percpu_ref_kill(&ctx->users);
	return 0;
}

/*
 * exit_aio: called when the last user of mm goes away.  At this point, there is
 * no way for any new requests to be submitted or any of the io_* syscalls to be
 * called on the context.
 *
 * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
 * them.
 */
void exit_aio(struct mm_struct *mm)
{
	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
	struct ctx_rq_wait wait;
	int i, skipped;

	if (!table)
		return;

	atomic_set(&wait.count, table->nr);
	init_completion(&wait.comp);

	skipped = 0;
	for (i = 0; i < table->nr; ++i) {
		struct kioctx *ctx =
			rcu_dereference_protected(table->table[i], true);

		if (!ctx) {
			skipped++;
			continue;
		}

		/*
		 * We don't need to bother with munmap() here - exit_mmap(mm)
		 * is coming and it'll unmap everything. And we simply can't,
		 * this is not necessarily our ->mm.
		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
		 * that it needs to unmap the area, just set it to 0.
		 */
		ctx->mmap_size = 0;
		kill_ioctx(mm, ctx, &wait);
	}

	if (!atomic_sub_and_test(skipped, &wait.count)) {
		/* Wait until all IO for the context is done. */
		wait_for_completion(&wait.comp);
	}

	RCU_INIT_POINTER(mm->ioctx_table, NULL);
	kfree(table);
}

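/*
 * Return nr completed-request slots to this CPU's cache, spilling
 * them back to the global reqs_available counter one req_batch at a
 * time while the cache holds at least two batches.
 */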
static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
	struct kioctx_cpu *kcpu;
	unsigned long flags;

	local_irq_save(flags);
	kcpu = this_cpu_ptr(ctx->cpu);
	kcpu->reqs_available += nr;

	while (kcpu->reqs_available >= ctx->req_batch * 2) {
		kcpu->reqs_available -= ctx->req_batch;
		atomic_add(ctx->req_batch, &ctx->reqs_available);
	}

	local_irq_restore(flags);
}

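/*
 * Take one request slot from this CPU's cache, refilling the cache
 * with a whole req_batch from the global counter (via cmpxchg) when
 * it runs dry.  Fails if the global counter holds less than a batch.
 */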
static bool __get_reqs_available(struct kioctx *ctx)
{
	struct kioctx_cpu *kcpu;
	bool ret = false;
	unsigned long flags;

	local_irq_save(flags);
	kcpu = this_cpu_ptr(ctx->cpu);
	if (!kcpu->reqs_available) {
		int avail = atomic_read(&ctx->reqs_available);

		do {
			if (avail < ctx->req_batch)
				goto out;
		} while (!atomic_try_cmpxchg(&ctx->reqs_available,
					     &avail, avail - ctx->req_batch));

		kcpu->reqs_available += ctx->req_batch;
	}

	ret = true;
	kcpu->reqs_available--;
out:
	local_irq_restore(flags);
	return ret;
}

/* refill_reqs_available
 *	Updates the reqs_available reference counts used for tracking the
 *	number of free slots in the completion ring.  This can be called
 *	from aio_complete() (to optimistically update reqs_available) or
 *	from aio_get_req() (the "we're out of events" case).  It must be
 *	called holding ctx->completion_lock.
 */
static void refill_reqs_available(struct kioctx *ctx, unsigned head,
                                  unsigned tail)
{
	unsigned events_in_ring, completed;

	/* Clamp head since userland can write to it. */
	head %= ctx->nr_events;
	if (head <= tail)
		events_in_ring = tail - head;
	else
		events_in_ring = ctx->nr_events - (head - tail);
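	/*
	 * Illustrative numbers: with nr_events = 128, head = 120 and
	 * tail = 10 the ring has wrapped, so events_in_ring =
	 * 128 - (120 - 10) = 18 events still awaiting delivery.
	 */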

	completed = ctx->completed_events;
	if (events_in_ring < completed)
		completed -= events_in_ring;
	else
		completed = 0;

	if (!completed)
		return;

	ctx->completed_events -= completed;
	put_reqs_available(ctx, completed);
}

/* user_refill_reqs_available
 *	Called to refill reqs_available when aio_get_req() encounters an
 *	out-of-space condition in the completion ring.
 */
static void user_refill_reqs_available(struct kioctx *ctx)
{
	spin_lock_irq(&ctx->completion_lock);
	if (ctx->completed_events) {
		struct aio_ring *ring;
		unsigned head;

		/* Access of ring->head may race with aio_read_events_ring()
		 * here, but that's okay: whether we read the old version or
		 * the new version, either will be valid.  The important
		 * part is that head cannot pass tail since we prevent
		 * aio_complete() from updating tail by holding
		 * ctx->completion_lock.  Even if head is invalid, the check
		 * against ctx->completed_events below will make sure we do the
		 * safe/right thing.
		 */
		ring = page_address(ctx->ring_pages[0]);
		head = ring->head;

		refill_reqs_available(ctx, head, ctx->tail);
	}

	spin_unlock_irq(&ctx->completion_lock);
}

static bool get_reqs_available(struct kioctx *ctx)
{
	if (__get_reqs_available(ctx))
		return true;
	user_refill_reqs_available(ctx);
	return __get_reqs_available(ctx);
}

/* aio_get_req
 *	Allocate a slot for an aio request.
 * Returns NULL if no requests are free.
 *
 * The refcount is initialized to 2 - one for the async op completion,
 * one for the synchronous code that does this.
 */
static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
	struct aio_kiocb *req;

	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
	if (unlikely(!req))
		return NULL;

	if (unlikely(!get_reqs_available(ctx))) {
		kmem_cache_free(kiocb_cachep, req);
		return NULL;
	}

	percpu_ref_get(&ctx->reqs);
	req->ki_ctx = ctx;
	INIT_LIST_HEAD(&req->ki_list);
	refcount_set(&req->ki_refcnt, 2);
	req->ki_eventfd = NULL;
	return req;
}

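/*
 * Translate a userspace ctx_id (the address of the ring mapping) into
 * a kioctx: read the id from the user-visible ring header, index
 * mm->ioctx_table under RCU (array_index_nospec() blocks speculative
 * overindexing), and take a users reference if the context is live.
 */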
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
	struct aio_ring __user *ring = (void __user *)ctx_id;
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx, *ret = NULL;
	struct kioctx_table *table;
	unsigned id;

	if (get_user(id, &ring->id))
		return NULL;

	rcu_read_lock();
	table = rcu_dereference(mm->ioctx_table);

	if (!table || id >= table->nr)
		goto out;

	id = array_index_nospec(id, table->nr);
	ctx = rcu_dereference(table->table[id]);
	if (ctx && ctx->user_id == ctx_id) {
		if (percpu_ref_tryget_live(&ctx->users))
			ret = ctx;
	}
out:
	rcu_read_unlock();
	return ret;
}

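/*
 * Drop everything a request pins: the eventfd and file references,
 * plus the ctx->reqs reference taken in aio_get_req(), then free the
 * request itself.
 */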
static inline void iocb_destroy(struct aio_kiocb *iocb)
{
	if (iocb->ki_eventfd)
		eventfd_ctx_put(iocb->ki_eventfd);
	if (iocb->ki_filp)
		fput(iocb->ki_filp);
	percpu_ref_put(&iocb->ki_ctx->reqs);
	kmem_cache_free(kiocb_cachep, iocb);
}

/* aio_complete
 *	Called when the io request on the given iocb is complete.
 */
static void aio_complete(struct aio_kiocb *iocb)
{
	struct kioctx	*ctx = iocb->ki_ctx;
	struct aio_ring	*ring;
	struct io_event	*ev_page, *event;
	unsigned tail, pos, head;
	unsigned long	flags;

	/*
	 * Add a completion event to the ring buffer. Must be done holding
	 * ctx->completion_lock to prevent other code from messing with the tail
	 * pointer since we might be called from irq context.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);

	tail = ctx->tail;
	pos = tail + AIO_EVENTS_OFFSET;

	if (++tail >= ctx->nr_events)
		tail = 0;

	ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
	event = ev_page + pos % AIO_EVENTS_PER_PAGE;

	*event = iocb->ki_res;

	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);

	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
		 (void __user *)(unsigned long)iocb->ki_res.obj,
		 iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);

	/* after flagging the request as done, we
	 * must never even look at it again
	 */
	smp_wmb();	/* make event visible before updating tail */

	ctx->tail = tail;

	ring = page_address(ctx->ring_pages[0]);
	head = ring->head;
	ring->tail = tail;
	flush_dcache_page(ctx->ring_pages[0]);

	ctx->completed_events++;
	if (ctx->completed_events > 1)
		refill_reqs_available(ctx, head, tail);
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	pr_debug("added to ring %p at [%u]\n", iocb, tail);

	/*
	 * Check if the user asked us to deliver the result through an
	 * eventfd. The eventfd_signal() function is safe to be called
	 * from IRQ context.
	 */
	if (iocb->ki_eventfd)
		eventfd_signal(iocb->ki_eventfd, 1);

	/*
	 * We have to order our ring_info tail store above and test
	 * of the wait list below outside the wait lock.  This is
	 * like in wake_up_bit() where clearing a bit has to be
	 * ordered with the unlocked test.
	 */
	smp_mb();

	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);
}

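/*
 * Drop one of the iocb's two references (submission side or completion
 * side); whoever drops the last one delivers the completion event and
 * frees the request.
 */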
static inline void iocb_put(struct aio_kiocb *iocb)
{
	if (refcount_dec_and_test(&iocb->ki_refcnt)) {
		aio_complete(iocb);
		iocb_destroy(iocb);
	}
}

/* aio_read_events_ring
 *	Pull an event off of the ioctx's event ring.  Returns the number of
 *	events fetched
 */
static long aio_read_events_ring(struct kioctx *ctx,
				 struct io_event __user *event, long nr)
{
	struct aio_ring *ring;
	unsigned head, tail, pos;
	long ret = 0;
	int copy_ret;

	/*
	 * The mutex can block and wake us up and that will cause
	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
	 * and repeat. This should be rare enough that it doesn't cause
	 * performance issues. See the comment in read_events() for more detail.
	 */
	sched_annotate_sleep();
	mutex_lock(&ctx->ring_lock);

	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
	ring = page_address(ctx->ring_pages[0]);
	head = ring->head;
	tail = ring->tail;

	/*
	 * Ensure that once we've read the current tail pointer, that
	 * we also see the events that were stored up to the tail.
	 */
	smp_rmb();

	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);

	if (head == tail)
		goto out;

	head %= ctx->nr_events;
	tail %= ctx->nr_events;

	while (ret < nr) {
		long avail;
		struct io_event *ev;
		struct page *page;

		avail = (head <= tail ?  tail : ctx->nr_events) - head;
		if (head == tail)
			break;

		pos = head + AIO_EVENTS_OFFSET;
		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
		pos %= AIO_EVENTS_PER_PAGE;

		avail = min(avail, nr - ret);
		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);

		ev = page_address(page);
		copy_ret = copy_to_user(event + ret, ev + pos,
					sizeof(*ev) * avail);

		if (unlikely(copy_ret)) {
			ret = -EFAULT;
			goto out;
		}

		ret += avail;
		head += avail;
		head %= ctx->nr_events;
	}

	ring = page_address(ctx->ring_pages[0]);
	ring->head = head;
	flush_dcache_page(ctx->ring_pages[0]);

	pr_debug("%li  h%u t%u\n", ret, head, tail);
out:
	mutex_unlock(&ctx->ring_lock);

	return ret;
}

static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
			    struct io_event __user *event, long *i)
{
	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);

	if (ret > 0)
		*i += ret;

	if (unlikely(atomic_read(&ctx->dead)))
		ret = -EINVAL;

	if (!*i)
		*i = ret;

	return ret < 0 || *i >= min_nr;
}

static long read_events(struct kioctx *ctx, long min_nr, long nr,
			struct io_event __user *event,
			ktime_t until)
{
	long ret = 0;

	/*
	 * Note that aio_read_events() is being called as the conditional - i.e.
	 * we're calling it after prepare_to_wait() has set task state to
	 * TASK_INTERRUPTIBLE.
	 *
	 * But aio_read_events() can block, and if it blocks it's going to flip
	 * the task state back to TASK_RUNNING.
	 *
	 * This should be ok, provided it doesn't flip the state back to
	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
	 * will only happen if the mutex_lock() call blocks, and we then find
	 * the ringbuffer empty. So in practice we should be ok, but it's
	 * something to be aware of when touching this code.
	 */
	if (until == 0)
		aio_read_events(ctx, min_nr, nr, event, &ret);
	else
		wait_event_interruptible_hrtimeout(ctx->wait,
				aio_read_events(ctx, min_nr, nr, event, &ret),
				until);
	return ret;
}

/* sys_io_setup:
 *	Create an aio_context capable of receiving at least nr_events.
 *	ctxp must not point to an aio_context that already exists, and
 *	must be initialized to 0 prior to the call.  On successful
 *	creation of the aio_context, *ctxp is filled in with the resulting
 *	handle.  May fail with -EINVAL if *ctxp is not initialized,
 *	if the specified nr_events exceeds internal limits.  May fail
 *	with -EAGAIN if the specified nr_events exceeds the user's limit
 *	of available events.  May fail with -ENOMEM if insufficient kernel
 *	resources are available.  May fail with -EFAULT if an invalid
 *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
 *	implemented.
 */
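/*
 * A minimal userspace sketch (hypothetical, not part of this file) of
 * the raw syscall usage described above:
 *
 *	aio_context_t ctx = 0;			// must start out zeroed
 *	if (syscall(__NR_io_setup, 128, &ctx) < 0)
 *		perror("io_setup");
 *	...
 *	syscall(__NR_io_destroy, ctx);
 */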
SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
{
	struct kioctx *ioctx = NULL;
	unsigned long ctx;
	long ret;

	ret = get_user(ctx, ctxp);
	if (unlikely(ret))
		goto out;

	ret = -EINVAL;
	if (unlikely(ctx || nr_events == 0)) {
		pr_debug("EINVAL: ctx %lu nr_events %u\n",
		         ctx, nr_events);
		goto out;
	}

	ioctx = ioctx_alloc(nr_events);
	ret = PTR_ERR(ioctx);
	if (!IS_ERR(ioctx)) {
		ret = put_user(ioctx->user_id, ctxp);
		if (ret)
			kill_ioctx(current->mm, ioctx, NULL);
		percpu_ref_put(&ioctx->users);
	}

out:
	return ret;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
{
	struct kioctx *ioctx = NULL;
	unsigned long ctx;
	long ret;

	ret = get_user(ctx, ctx32p);
	if (unlikely(ret))
		goto out;

	ret = -EINVAL;
	if (unlikely(ctx || nr_events == 0)) {
		pr_debug("EINVAL: ctx %lu nr_events %u\n",
		         ctx, nr_events);
		goto out;
	}

	ioctx = ioctx_alloc(nr_events);
	ret = PTR_ERR(ioctx);
	if (!IS_ERR(ioctx)) {
		/* truncating is ok because it's a user address */
		ret = put_user((u32)ioctx->user_id, ctx32p);
		if (ret)
			kill_ioctx(current->mm, ioctx, NULL);
		percpu_ref_put(&ioctx->users);
	}

out:
	return ret;
}
#endif

/* sys_io_destroy:
 *	Destroy the aio_context specified.  May cancel any outstanding
 *	AIOs and block on completion.  Will fail with -ENOSYS if not
 *	implemented.  May fail with -EINVAL if the context pointed to
 *	is invalid.
 */
SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
	struct kioctx *ioctx = lookup_ioctx(ctx);
	if (likely(NULL != ioctx)) {
		struct ctx_rq_wait wait;
		int ret;

		init_completion(&wait.comp);
		atomic_set(&wait.count, 1);

		/* Pass &wait to kill_ioctx() where it can be set in a
		 * thread-safe way. If we tried to set it here then we'd have
		 * a race condition if two io_destroy() calls ran
		 * simultaneously.
		 */
		ret = kill_ioctx(current->mm, ioctx, &wait);
		percpu_ref_put(&ioctx->users);

		/* Wait until all IO for the context is done. Otherwise the
		 * kernel keeps using user-space buffers even though the user
		 * thinks the context is destroyed.
		 */
		if (!ret)
			wait_for_completion(&wait.comp);

		return ret;
	}
	pr_debug("EINVAL: invalid context id\n");
	return -EINVAL;
}

static void aio_remove_iocb(struct aio_kiocb *iocb)
{
	struct kioctx *ctx = iocb->ki_ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	list_del(&iocb->ki_list);
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}

static void aio_complete_rw(struct kiocb *kiocb, long res)
{
	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);

	if (!list_empty_careful(&iocb->ki_list))
		aio_remove_iocb(iocb);

	if (kiocb->ki_flags & IOCB_WRITE) {
		struct inode *inode = file_inode(kiocb->ki_filp);

		if (S_ISREG(inode->i_mode))
			kiocb_end_write(kiocb);
	}

	iocb->ki_res.res = res;
	iocb->ki_res.res2 = 0;
	iocb_put(iocb);
}

static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
{
	int ret;

	req->ki_complete = aio_complete_rw;
	req->private = NULL;
	req->ki_pos = iocb->aio_offset;
	req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
	if (iocb->aio_flags & IOCB_FLAG_RESFD)
		req->ki_flags |= IOCB_EVENTFD;
	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
		/*
		 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
		 * aio_reqprio is interpreted as an I/O scheduling
		 * class and priority.
		 */
		ret = ioprio_check_cap(iocb->aio_reqprio);
		if (ret) {
			pr_debug("aio ioprio check cap error: %d\n", ret);
			return ret;
		}

		req->ki_ioprio = iocb->aio_reqprio;
	} else
		req->ki_ioprio = get_current_ioprio();

	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
	if (unlikely(ret))
		return ret;

	req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
	return 0;
}

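/*
 * Map the user buffer described by the iocb into an iov_iter: a single
 * range for plain reads/writes, or a user iovec array (possibly a
 * compat one) for the vectored variants.
 */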
static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
		struct iovec **iovec, bool vectored, bool compat,
		struct iov_iter *iter)
{
	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
	size_t len = iocb->aio_nbytes;

	if (!vectored) {
		ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
		*iovec = NULL;
		return ret;
	}

	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
}

static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * There's no easy way to restart the syscall since other AIOs
		 * may already be running. Just fail this IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		req->ki_complete(req, ret);
	}
}

static int aio_read(struct kiocb *req, const struct iocb *iocb,
			bool vectored, bool compat)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct iov_iter iter;
	struct file *file;
	int ret;

	ret = aio_prep_rw(req, iocb);
	if (ret)
		return ret;
	file = req->ki_filp;
	if (unlikely(!(file->f_mode & FMODE_READ)))
		return -EBADF;
	if (unlikely(!file->f_op->read_iter))
		return -EINVAL;

	ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter);
	if (ret < 0)
		return ret;
	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
	if (!ret)
		aio_rw_done(req, call_read_iter(file, req, &iter));
	kfree(iovec);
	return ret;
}

static int aio_write(struct kiocb *req, const struct iocb *iocb,
			 bool vectored, bool compat)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct iov_iter iter;
	struct file *file;
	int ret;

	ret = aio_prep_rw(req, iocb);
	if (ret)
		return ret;
	file = req->ki_filp;

	if (unlikely(!(file->f_mode & FMODE_WRITE)))
		return -EBADF;
	if (unlikely(!file->f_op->write_iter))
		return -EINVAL;

	ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter);
	if (ret < 0)
		return ret;
	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
	if (!ret) {
		if (S_ISREG(file_inode(file)->i_mode))
			kiocb_start_write(req);
		req->ki_flags |= IOCB_WRITE;
		aio_rw_done(req, call_write_iter(file, req, &iter));
	}
	kfree(iovec);
	return ret;
}

static void aio_fsync_work(struct work_struct *work)
{
	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
	const struct cred *old_cred = override_creds(iocb->fsync.creds);

	iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
	revert_creds(old_cred);
	put_cred(iocb->fsync.creds);
	iocb_put(iocb);
}

static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
		     bool datasync)
{
	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
			iocb->aio_rw_flags))
		return -EINVAL;

	if (unlikely(!req->file->f_op->fsync))
		return -EINVAL;

	req->creds = prepare_creds();
	if (!req->creds)
		return -ENOMEM;

	req->datasync = datasync;
	INIT_WORK(&req->work, aio_fsync_work);
	schedule_work(&req->work);
	return 0;
}

static void aio_poll_put_work(struct work_struct *work)
{
	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);

	iocb_put(iocb);
}

/*
 * Safely lock the waitqueue which the request is on, synchronizing with the
 * case where the ->poll() provider decides to free its waitqueue early.
 *
 * Returns true on success, meaning that req->head->lock was locked, req->wait
 * is on req->head, and an RCU read lock was taken.  Returns false if the
 * request was already removed from its waitqueue (which might no longer exist).
 */
static bool poll_iocb_lock_wq(struct poll_iocb *req)
{
	wait_queue_head_t *head;

	/*
	 * While we hold the waitqueue lock and the waitqueue is nonempty,
	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
	 * lock in the first place can race with the waitqueue being freed.
	 *
	 * We solve this as eventpoll does: by taking advantage of the fact that
	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
	 * we enter rcu_read_lock() and see that the pointer to the queue is
	 * non-NULL, we can then lock it without the memory being freed out from
	 * under us, then check whether the request is still on the queue.
	 *
	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
	 * case the caller deletes the entry from the queue, leaving it empty.
	 * In that case, only RCU prevents the queue memory from being freed.
	 */
	rcu_read_lock();
	head = smp_load_acquire(&req->head);
	if (head) {
		spin_lock(&head->lock);
		if (!list_empty(&req->wait.entry))
			return true;
		spin_unlock(&head->lock);
	}
	rcu_read_unlock();
	return false;
}

static void poll_iocb_unlock_wq(struct poll_iocb *req)
{
	spin_unlock(&req->head->lock);
	rcu_read_unlock();
}

static void aio_poll_complete_work(struct work_struct *work)
{
	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
	struct poll_table_struct pt = { ._key = req->events };
	struct kioctx *ctx = iocb->ki_ctx;
	__poll_t mask = 0;

	if (!READ_ONCE(req->cancelled))
		mask = vfs_poll(req->file, &pt) & req->events;

	/*
	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
	 * synchronize with them.  In the cancellation case the list_del_init
	 * itself is not actually needed, but harmless so we keep it in to
	 * avoid further branches in the fast path.
	 */
	spin_lock_irq(&ctx->ctx_lock);
	if (poll_iocb_lock_wq(req)) {
		if (!mask && !READ_ONCE(req->cancelled)) {
			/*
			 * The request isn't actually ready to be completed yet.
			 * Reschedule completion if another wakeup came in.
			 */
			if (req->work_need_resched) {
				schedule_work(&req->work);
				req->work_need_resched = false;
			} else {
				req->work_scheduled = false;
			}
			poll_iocb_unlock_wq(req);
			spin_unlock_irq(&ctx->ctx_lock);
			return;
		}
		list_del_init(&req->wait.entry);
		poll_iocb_unlock_wq(req);
	} /* else, POLLFREE has freed the waitqueue, so we must complete */
	list_del_init(&iocb->ki_list);
	iocb->ki_res.res = mangle_poll(mask);
	spin_unlock_irq(&ctx->ctx_lock);

	iocb_put(iocb);
}

/* assumes we are called with irqs disabled */
static int aio_poll_cancel(struct kiocb *iocb)
{
	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
	struct poll_iocb *req = &aiocb->poll;

	if (poll_iocb_lock_wq(req)) {
		WRITE_ONCE(req->cancelled, true);
		if (!req->work_scheduled) {
			schedule_work(&aiocb->poll.work);
			req->work_scheduled = true;
		}
		poll_iocb_unlock_wq(req);
	} /* else, the request was force-cancelled by POLLFREE already */

	return 0;
}

static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
		void *key)
{
	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
	__poll_t mask = key_to_poll(key);
	unsigned long flags;

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & req->events))
		return 0;

	/*
	 * Complete the request inline if possible.  This requires that three
	 * conditions be met:
	 *   1. An event mask must have been passed.  If a plain wakeup was done
	 *	instead, then mask == 0 and we have to call vfs_poll() to get
	 *	the events, so inline completion isn't possible.
	 *   2. The completion work must not have already been scheduled.
	 *   3. ctx_lock must not be busy.  We have to use trylock because we
	 *	already hold the waitqueue lock, so this inverts the normal
	 *	locking order.  Use irqsave/irqrestore because not all
	 *	filesystems (e.g. fuse) call this function with IRQs disabled,
	 *	yet IRQs have to be disabled before ctx_lock is obtained.
	 */
	if (mask && !req->work_scheduled &&
	    spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
		struct kioctx *ctx = iocb->ki_ctx;

		list_del_init(&req->wait.entry);
		list_del(&iocb->ki_list);
		iocb->ki_res.res = mangle_poll(mask);
		if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
			iocb = NULL;
			INIT_WORK(&req->work, aio_poll_put_work);
			schedule_work(&req->work);
		}
		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
		if (iocb)
			iocb_put(iocb);
	} else {
		/*
		 * Schedule the completion work if needed.  If it was already
		 * scheduled, record that another wakeup came in.
		 *
		 * Don't remove the request from the waitqueue here, as it might
		 * not actually be complete yet (we won't know until vfs_poll()
		 * is called), and we must not miss any wakeups.  POLLFREE is an
		 * exception to this; see below.
		 */
		if (req->work_scheduled) {
			req->work_need_resched = true;
		} else {
			schedule_work(&req->work);
			req->work_scheduled = true;
		}

		/*
		 * If the waitqueue is being freed early but we can't complete
		 * the request inline, we have to tear down the request as best
		 * we can.  That means immediately removing the request from its
		 * waitqueue and preventing all further accesses to the
		 * waitqueue via the request.  We also need to schedule the
		 * completion work (done above).  Also mark the request as
		 * cancelled, to potentially skip an unneeded call to ->poll().
		 */
		if (mask & POLLFREE) {
			WRITE_ONCE(req->cancelled, true);
			list_del_init(&req->wait.entry);

			/*
			 * Careful: this *must* be the last step, since as soon
			 * as req->head is NULL'ed out, the request can be
			 * completed and freed, since aio_poll_complete_work()
			 * will no longer need to take the waitqueue lock.
			 */
			smp_store_release(&req->head, NULL);
		}
	}
	return 1;
}

struct aio_poll_table {
	struct poll_table_struct	pt;
	struct aio_kiocb		*iocb;
	bool				queued;
	int				error;
};

static void
aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
		struct poll_table_struct *p)
{
	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);

	/* multiple wait queues per file are not supported */
	if (unlikely(pt->queued)) {
		pt->error = -EINVAL;
		return;
	}

	pt->queued = true;
	pt->error = 0;
	pt->iocb->poll.head = head;
	add_wait_queue(head, &pt->iocb->poll.wait);
}

static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
{
	struct kioctx *ctx = aiocb->ki_ctx;
	struct poll_iocb *req = &aiocb->poll;
	struct aio_poll_table apt;
	bool cancel = false;
	__poll_t mask;

	/* reject any unknown events outside the normal event mask. */
	if ((u16)iocb->aio_buf != iocb->aio_buf)
		return -EINVAL;
	/* reject fields that are not defined for poll */
	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
		return -EINVAL;

	INIT_WORK(&req->work, aio_poll_complete_work);
	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;

	req->head = NULL;
	req->cancelled = false;
	req->work_scheduled = false;
	req->work_need_resched = false;

	apt.pt._qproc = aio_poll_queue_proc;
	apt.pt._key = req->events;
	apt.iocb = aiocb;
	apt.queued = false;
	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */

	/* initialize the list so that we can do list_empty checks */
	INIT_LIST_HEAD(&req->wait.entry);
	init_waitqueue_func_entry(&req->wait, aio_poll_wake);

	mask = vfs_poll(req->file, &apt.pt) & req->events;
	spin_lock_irq(&ctx->ctx_lock);
	if (likely(apt.queued)) {
		bool on_queue = poll_iocb_lock_wq(req);

		if (!on_queue || req->work_scheduled) {
			/*
			 * aio_poll_wake() already either scheduled the async
			 * completion work, or completed the request inline.
			 */
			if (apt.error) /* unsupported case: multiple queues */
				cancel = true;
			apt.error = 0;
			mask = 0;
		}
		if (mask || apt.error) {
			/* Steal to complete synchronously. */
			list_del_init(&req->wait.entry);
		} else if (cancel) {
			/* Cancel if possible (may be too late though). */
			WRITE_ONCE(req->cancelled, true);
		} else if (on_queue) {
			/*
			 * Actually waiting for an event, so add the request to
			 * active_reqs so that it can be cancelled if needed.
			 */
			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
			aiocb->ki_cancel = aio_poll_cancel;
		}
		if (on_queue)
			poll_iocb_unlock_wq(req);
	}
	if (mask) { /* no async, we'd stolen it */
		aiocb->ki_res.res = mangle_poll(mask);
		apt.error = 0;
	}
	spin_unlock_irq(&ctx->ctx_lock);
	if (mask)
		iocb_put(aiocb);
	return apt.error;
}

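/*
 * Set up and dispatch a single request: take the file (and optional
 * eventfd) references, store the user iocb's key and completion cookie,
 * and hand off to the opcode-specific handler.
 */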
static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
			   struct iocb __user *user_iocb, struct aio_kiocb *req,
			   bool compat)
{
	req->ki_filp = fget(iocb->aio_fildes);
	if (unlikely(!req->ki_filp))
		return -EBADF;

	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
		struct eventfd_ctx *eventfd;
		/*
		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
		 * instance of the file* now. The file descriptor must be
		 * an eventfd() fd, and will be signaled for each completed
		 * event using the eventfd_signal() function.
		 */
		eventfd = eventfd_ctx_fdget(iocb->aio_resfd);
		if (IS_ERR(eventfd))
			return PTR_ERR(eventfd);

		req->ki_eventfd = eventfd;
	}

	if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
		pr_debug("EFAULT: aio_key\n");
		return -EFAULT;
	}

	req->ki_res.obj = (u64)(unsigned long)user_iocb;
	req->ki_res.data = iocb->aio_data;
	req->ki_res.res = 0;
	req->ki_res.res2 = 0;

	switch (iocb->aio_lio_opcode) {
	case IOCB_CMD_PREAD:
		return aio_read(&req->rw, iocb, false, compat);
	case IOCB_CMD_PWRITE:
		return aio_write(&req->rw, iocb, false, compat);
	case IOCB_CMD_PREADV:
		return aio_read(&req->rw, iocb, true, compat);
	case IOCB_CMD_PWRITEV:
		return aio_write(&req->rw, iocb, true, compat);
	case IOCB_CMD_FSYNC:
		return aio_fsync(&req->fsync, iocb, false);
	case IOCB_CMD_FDSYNC:
		return aio_fsync(&req->fsync, iocb, true);
	case IOCB_CMD_POLL:
		return aio_poll(req, iocb);
	default:
		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
		return -EINVAL;
	}
}

static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
			 bool compat)
{
	struct aio_kiocb *req;
	struct iocb iocb;
	int err;

	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
		return -EFAULT;

	/* enforce forwards compatibility on users */
	if (unlikely(iocb.aio_reserved2)) {
		pr_debug("EINVAL: reserved field set\n");
		return -EINVAL;
	}

	/* prevent overflows */
	if (unlikely(
	    (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
	    (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
	    ((ssize_t)iocb.aio_nbytes < 0)
	   )) {
		pr_debug("EINVAL: overflow check\n");
		return -EINVAL;
	}

	req = aio_get_req(ctx);
	if (unlikely(!req))
		return -EAGAIN;

	err = __io_submit_one(ctx, &iocb, user_iocb, req, compat);

	/* Done with the synchronous reference */
	iocb_put(req);

	/*
	 * If err is 0, we've either done aio_complete() ourselves or have
	 * arranged for that to be done asynchronously.  Anything non-zero
	 * means that we need to destroy req ourselves.
	 */
	if (unlikely(err)) {
		iocb_destroy(req);
		put_reqs_available(ctx, 1);
	}
	return err;
}

/* sys_io_submit:
 *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
 *	the number of iocbs queued.  May return -EINVAL if the aio_context
 *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
 *	*iocbpp[0] is not properly initialized, if the operation specified
 *	is invalid for the file descriptor in the iocb.  May fail with
 *	-EFAULT if any of the data structures point to invalid data.  May
 *	fail with -EBADF if the file descriptor specified in the first
 *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
 *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *	fail with -ENOSYS if not implemented.
 */
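/*
 * A minimal userspace sketch (hypothetical, not part of this file),
 * assuming "ctx" came from a successful io_setup() and "fd"/"buf" are
 * an open descriptor and a buffer:
 *
 *	struct iocb cb = { 0 }, *cbs[1] = { &cb };
 *	cb.aio_lio_opcode = IOCB_CMD_PREAD;
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = (__u64)(unsigned long)buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (syscall(__NR_io_submit, ctx, 1, cbs) != 1)
 *		perror("io_submit");
 */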
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
		struct iocb __user * __user *, iocbpp)
{
	struct kioctx *ctx;
	long ret = 0;
	int i = 0;
	struct blk_plug plug;

	if (unlikely(nr < 0))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx)) {
		pr_debug("EINVAL: invalid context id\n");
		return -EINVAL;
	}

	if (nr > ctx->nr_events)
		nr = ctx->nr_events;

	if (nr > AIO_PLUG_THRESHOLD)
		blk_start_plug(&plug);
	for (i = 0; i < nr; i++) {
		struct iocb __user *user_iocb;

		if (unlikely(get_user(user_iocb, iocbpp + i))) {
			ret = -EFAULT;
			break;
		}

		ret = io_submit_one(ctx, user_iocb, false);
		if (ret)
			break;
	}
	if (nr > AIO_PLUG_THRESHOLD)
		blk_finish_plug(&plug);

	percpu_ref_put(&ctx->users);
	return i ? i : ret;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
		       int, nr, compat_uptr_t __user *, iocbpp)
{
	struct kioctx *ctx;
	long ret = 0;
	int i = 0;
	struct blk_plug plug;

	if (unlikely(nr < 0))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx)) {
		pr_debug("EINVAL: invalid context id\n");
		return -EINVAL;
	}

	if (nr > ctx->nr_events)
		nr = ctx->nr_events;

	if (nr > AIO_PLUG_THRESHOLD)
		blk_start_plug(&plug);
	for (i = 0; i < nr; i++) {
		compat_uptr_t user_iocb;

		if (unlikely(get_user(user_iocb, iocbpp + i))) {
			ret = -EFAULT;
			break;
		}

		ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
		if (ret)
			break;
	}
	if (nr > AIO_PLUG_THRESHOLD)
		blk_finish_plug(&plug);

	percpu_ref_put(&ctx->users);
	return i ? i : ret;
}
#endif

/* sys_io_cancel:
 *	Attempts to cancel an iocb previously passed to io_submit.  The
 *	result argument is no longer used: the completion event is always
 *	delivered via the ring buffer, and a successfully started
 *	cancellation returns -EINPROGRESS (see below).  May fail with
 *	-EFAULT if any of the data structures pointed to are invalid.
 *	May fail with -EINVAL if the aio_context specified by ctx_id is
 *	invalid or the iocb is not found.  May fail with -EAGAIN if the
 *	iocb specified was not cancelled.  Will fail with -ENOSYS if not
 *	implemented.
 */
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
		struct io_event __user *, result)
{
	struct kioctx *ctx;
	struct aio_kiocb *kiocb;
	int ret = -EINVAL;
	u32 key;
	u64 obj = (u64)(unsigned long)iocb;

	if (unlikely(get_user(key, &iocb->aio_key)))
		return -EFAULT;
	if (unlikely(key != KIOCB_KEY))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx))
		return -EINVAL;

	spin_lock_irq(&ctx->ctx_lock);
	/* TODO: use a hash or array, this sucks. */
	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
		if (kiocb->ki_res.obj == obj) {
			ret = kiocb->ki_cancel(&kiocb->rw);
			list_del_init(&kiocb->ki_list);
			break;
		}
	}
	spin_unlock_irq(&ctx->ctx_lock);
215962306a36Sopenharmony_ci
	if (!ret) {
		/*
		 * The result argument is no longer used - the io_event is
		 * always delivered via the ring buffer.  -EINPROGRESS
		 * indicates that cancellation is in progress:
		 */
		ret = -EINPROGRESS;
	}

	percpu_ref_put(&ctx->users);

	return ret;
}
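
/*
 * A minimal userspace sketch (not part of this file) of issuing a cancel
 * through the raw syscall.  __NR_io_cancel comes from <sys/syscall.h>; ctx
 * and iocb are assumed to be a live io_context_t and a previously submitted
 * iocb.  As noted above, the result buffer is no longer written to, but
 * passing a valid one keeps the call portable to older kernels that did
 * fill it in:
 *
 *	struct io_event res;
 *	int rc = syscall(__NR_io_cancel, ctx, &iocb, &res);
 *	if (rc < 0 && errno == EINPROGRESS)
 *		;	// request accepted; reap the event via io_getevents()
 */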

static long do_io_getevents(aio_context_t ctx_id,
		long min_nr,
		long nr,
		struct io_event __user *events,
		struct timespec64 *ts)
{
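	/* A NULL timespec selects an infinite wait, expressed as KTIME_MAX. */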
	ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
	struct kioctx *ioctx = lookup_ioctx(ctx_id);
	long ret = -EINVAL;

	if (likely(ioctx)) {
		if (likely(min_nr <= nr && min_nr >= 0))
			ret = read_events(ioctx, min_nr, nr, events, until);
		percpu_ref_put(&ioctx->users);
	}

	return ret;
}

/* io_getevents:
 *	Attempts to read at least min_nr events and up to nr events from
 *	the completion queue for the aio_context specified by ctx_id.  If
 *	it succeeds, the number of read events is returned.  May fail with
 *	-EINVAL if ctx_id is invalid, or if min_nr, nr, or the timeout is
 *	out of range.  May fail with -EFAULT if any of the memory specified
 *	is invalid.  May return 0 or fewer than min_nr events if the
 *	timeout has elapsed before sufficient events are available; a NULL
 *	timeout specifies an infinite wait.  Note that the timeout pointed
 *	to by timeout is relative.  Will fail with -ENOSYS if not
 *	implemented.
 */
#ifdef CONFIG_64BIT

SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
		long, min_nr,
		long, nr,
		struct io_event __user *, events,
		struct __kernel_timespec __user *, timeout)
{
	struct timespec64	ts;
	int			ret;

	if (timeout && unlikely(get_timespec64(&ts, timeout)))
		return -EFAULT;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
	if (!ret && signal_pending(current))
		ret = -EINTR;
	return ret;
}
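
/*
 * A minimal userspace sketch (not part of this file) of reaping events
 * with a relative timeout through the raw syscall; ctx is assumed to be
 * an io_context_t set up earlier with io_setup():
 *
 *	struct io_event ev[8];
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	int n = syscall(__NR_io_getevents, ctx, 1, 8, ev, &ts);
 *	// n is the number of events reaped; it may be 0 (or fewer than
 *	// min_nr) if the one-second timeout expired first.
 */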

#endif

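/* Userspace-visible sigmask descriptor consumed by io_pgetevents() below. */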
struct __aio_sigset {
	const sigset_t __user	*sigmask;
	size_t		sigsetsize;
};

SYSCALL_DEFINE6(io_pgetevents,
		aio_context_t, ctx_id,
		long, min_nr,
		long, nr,
		struct io_event __user *, events,
		struct __kernel_timespec __user *, timeout,
		const struct __aio_sigset __user *, usig)
{
	struct __aio_sigset	ksig = { NULL, };
	struct timespec64	ts;
	bool interrupted;
	int ret;

	if (timeout && unlikely(get_timespec64(&ts, timeout)))
		return -EFAULT;

	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;

	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
	if (ret)
		return ret;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);

	interrupted = signal_pending(current);
	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;

	return ret;
}
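
/*
 * A minimal userspace sketch (not part of this file) of the pselect-style
 * handshake io_pgetevents() offers: the kernel installs the caller's mask
 * for the duration of the wait and restores the old one afterwards, so a
 * signal cannot slip in between unblocking it and starting to wait.  The
 * sigsetsize of 8 assumes the kernel's sigset_t size on x86-64, and the
 * local struct mirrors the __aio_sigset layout above:
 *
 *	struct io_event ev[8];
 *	sigset_t allow;
 *	sigfillset(&allow);
 *	sigdelset(&allow, SIGUSR1);	// only SIGUSR1 may interrupt the wait
 *	struct { const sigset_t *sigmask; size_t sigsetsize; } usig =
 *		{ &allow, 8 };
 *	int n = syscall(__NR_io_pgetevents, ctx, 1, 8, ev, NULL, &usig);
 */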

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

SYSCALL_DEFINE6(io_pgetevents_time32,
		aio_context_t, ctx_id,
		long, min_nr,
		long, nr,
		struct io_event __user *, events,
		struct old_timespec32 __user *, timeout,
		const struct __aio_sigset __user *, usig)
{
	struct __aio_sigset	ksig = { NULL, };
	struct timespec64	ts;
	bool interrupted;
	int ret;

	if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
		return -EFAULT;

	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;

	ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
	if (ret)
		return ret;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);

	interrupted = signal_pending(current);
	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;

	return ret;
}

#endif

#if defined(CONFIG_COMPAT_32BIT_TIME)

SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
		__s32, min_nr,
		__s32, nr,
		struct io_event __user *, events,
		struct old_timespec32 __user *, timeout)
{
	struct timespec64 t;
	int ret;

	if (timeout && get_old_timespec32(&t, timeout))
		return -EFAULT;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
	if (!ret && signal_pending(current))
		ret = -EINTR;
	return ret;
}

#endif

#ifdef CONFIG_COMPAT

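/*
 * 32-bit ABI mirror of struct __aio_sigset: the pointer and size fields
 * shrink to compat_uptr_t and compat_size_t.
 */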
struct __compat_aio_sigset {
	compat_uptr_t		sigmask;
	compat_size_t		sigsetsize;
};

#if defined(CONFIG_COMPAT_32BIT_TIME)

COMPAT_SYSCALL_DEFINE6(io_pgetevents,
		compat_aio_context_t, ctx_id,
		compat_long_t, min_nr,
		compat_long_t, nr,
		struct io_event __user *, events,
		struct old_timespec32 __user *, timeout,
		const struct __compat_aio_sigset __user *, usig)
{
	struct __compat_aio_sigset ksig = { 0, };
	struct timespec64 t;
	bool interrupted;
	int ret;

	if (timeout && get_old_timespec32(&t, timeout))
		return -EFAULT;

	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;

	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
	if (ret)
		return ret;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);

	interrupted = signal_pending(current);
	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;

	return ret;
}

#endif

COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
		compat_aio_context_t, ctx_id,
		compat_long_t, min_nr,
		compat_long_t, nr,
		struct io_event __user *, events,
		struct __kernel_timespec __user *, timeout,
		const struct __compat_aio_sigset __user *, usig)
{
	struct __compat_aio_sigset ksig = { 0, };
	struct timespec64 t;
	bool interrupted;
	int ret;

	if (timeout && get_timespec64(&t, timeout))
		return -EFAULT;

	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
		return -EFAULT;

	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
	if (ret)
		return ret;

	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);

	interrupted = signal_pending(current);
	restore_saved_sigmask_unless(interrupted);
	if (interrupted && !ret)
		ret = -ERESTARTNOHAND;

	return ret;
}
#endif